^tools/blktap/parallax/vdi_validate$
^tools/blktap/parallax/parallax$
^tools/blktap/parallax/blockstored$
+^tools/blktap/ublkback/ublkback$
^tools/blktap/xen/.*$
^tools/check/\..*$
^tools/cmdline/.*$
-obj-y := blktap_userdev.o blktap_datapath.o blktap_controlmsg.o blktap.o
+obj-y := xenbus.o interface.o blktap.o
/******************************************************************************
- * blktap.c
+ * arch/xen/drivers/blkif/blktap/blktap.c
*
- * XenLinux virtual block-device tap.
+ * This is a modified version of the block backend driver that remaps requests
+ * to a user-space memory region. It is intended to be used to write
+ * application-level servers that provide block interfaces to client VMs.
*
- * Copyright (c) 2004, Andrew Warfield
- *
- * Based on the original split block driver:
- * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
- * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
- * Copyright (c) 2004, Christian Limpach
- *
- * Note that unlike the split block driver code, this driver has been developed
- * strictly for Linux 2.6
*/
-#include "blktap.h"
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <asm-xen/balloon.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/miscdevice.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/gfp.h>
+#include <linux/poll.h>
+#include <asm/tlbflush.h>
+#include "common.h"
+
+/* Only one process may open /dev/xen/blktap at any time. */
+static unsigned long blktap_dev_inuse;
+unsigned long blktap_ring_ok; /* make this ring->state */
+
+/* Rings up to user space. */
+static blkif_front_ring_t blktap_ufe_ring;
+
+/* for poll: */
+static wait_queue_head_t blktap_wait;
+
+/* current switching mode */
+static unsigned long blktap_mode;
+
+/* local prototypes */
+static int blktap_read_ufe_ring(void);
+
+
+/* /dev/xen/blktap resides at device number major=10, minor=202 */
+#define BLKTAP_MINOR 202
+
+/* blktap IOCTLs: */
+#define BLKTAP_IOCTL_KICK_FE 1
+#define BLKTAP_IOCTL_KICK_BE 2 /* currently unused */
+#define BLKTAP_IOCTL_SETMODE 3
+#define BLKTAP_IOCTL_PRINT_IDXS 100
+
+/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */
+#define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */
+#define BLKTAP_MODE_INTERCEPT_FE 0x00000001
+#define BLKTAP_MODE_INTERCEPT_BE 0x00000002 /* unimp. */
+#define BLKTAP_MODE_COPY_FE 0x00000004 /* unimp. */
+#define BLKTAP_MODE_COPY_BE 0x00000008 /* unimp. */
+#define BLKTAP_MODE_COPY_FE_PAGES 0x00000010 /* unimp. */
+#define BLKTAP_MODE_COPY_BE_PAGES 0x00000020 /* unimp. */
+
+#define BLKTAP_MODE_INTERPOSE \
+ (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
+
+#define BLKTAP_MODE_COPY_BOTH \
+ (BLKTAP_MODE_COPY_FE | BLKTAP_MODE_COPY_BE)
+
+#define BLKTAP_MODE_COPY_BOTH_PAGES \
+ (BLKTAP_MODE_COPY_FE_PAGES | BLKTAP_MODE_COPY_BE_PAGES)
-int __init xlblktap_init(void)
+static inline int BLKTAP_MODE_VALID(unsigned long arg)
{
- ctrl_msg_t cmsg;
- blkif_fe_driver_status_t fe_st;
- blkif_be_driver_status_t be_st;
+ return (
+ ( arg == BLKTAP_MODE_PASSTHROUGH ) ||
+ ( arg == BLKTAP_MODE_INTERCEPT_FE ) ||
+ ( arg == BLKTAP_MODE_INTERPOSE ) );
+/*
+ return (
+ ( arg == BLKTAP_MODE_PASSTHROUGH ) ||
+ ( arg == BLKTAP_MODE_INTERCEPT_FE ) ||
+ ( arg == BLKTAP_MODE_INTERCEPT_BE ) ||
+ ( arg == BLKTAP_MODE_INTERPOSE ) ||
+ ( (arg & ~BLKTAP_MODE_COPY_FE_PAGES) == BLKTAP_MODE_COPY_FE ) ||
+ ( (arg & ~BLKTAP_MODE_COPY_BE_PAGES) == BLKTAP_MODE_COPY_BE ) ||
+ ( (arg & ~BLKTAP_MODE_COPY_BOTH_PAGES) == BLKTAP_MODE_COPY_BOTH )
+ );
+*/
+}
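+
+/*
+ * Illustrative user-space sketch (not part of this driver): a tap process
+ * opens the device, picks a switching mode, maps the ring + data region,
+ * and then polls for work. Header names and error handling are omitted;
+ * this is an assumption about usage, not code shipped with this patch.
+ *
+ *   int fd = open("/dev/xen/blktap", O_RDWR);
+ *   ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERCEPT_FE);
+ *   void *shm = mmap(NULL, (MMAP_PAGES + RING_PAGES) * getpagesize(),
+ *                    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ *   for (;;) {
+ *       poll(...);                          // wait for pushed requests
+ *       ... consume requests, queue responses ...
+ *       ioctl(fd, BLKTAP_IOCTL_KICK_FE, 0); // have the driver reap them
+ *   }
+ */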
+
+
+/******************************************************************
+ * MMAP REGION
+ */
+
+/*
+ * We use a big chunk of address space to map in-flight requests into,
+ * and export this region up to user-space. See the comments in blkback
+ * about this -- the two must be kept in sync if the tap is used as a
+ * passthrough.
+ */
+
+#define MAX_PENDING_REQS 64
+#define BATCH_PER_DOMAIN 16
- printk(KERN_INFO "Initialising Xen block tap device\n");
-#ifdef CONFIG_XEN_BLKDEV_GRANT
- printk(KERN_INFO "Block tap is using grant tables.\n");
-#endif
+/* immediately before the mmap area, we have a bunch of pages reserved
+ * for shared memory rings.
+ */
+#define RING_PAGES 1 /* Front */
+
+/* Where things are inside the device mapping. */
+struct vm_area_struct *blktap_vma = NULL;
+unsigned long mmap_vstart; /* Kernel pages for mapping in data. */
+unsigned long rings_vstart; /* start of mmaped vma */
+unsigned long user_vstart; /* start of user mappings */
- DPRINTK(" tap - Backend connection init:\n");
+#define MMAP_PAGES \
+ (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
+#define MMAP_VADDR(_start, _req,_seg) \
+ (_start + \
+ ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \
+ ((_seg) * PAGE_SIZE))
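+
+/*
+ * Worked example of the layout above, assuming 4KiB pages and the usual
+ * BLKIF_MAX_SEGMENTS_PER_REQUEST of 11: segment 3 of in-flight request 2
+ * sits at
+ *
+ *   MMAP_VADDR(user_vstart, 2, 3) == user_vstart + (2*11 + 3) * 4096
+ *
+ * The same arithmetic against mmap_vstart yields the kernel-side alias
+ * of the identical machine frame.
+ */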
- (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx,
- CALLBACK_IN_BLOCKING_CONTEXT);
- /* Send a driver-UP notification to the domain controller. */
- cmsg.type = CMSG_BLKIF_FE;
- cmsg.subtype = CMSG_BLKIF_FE_DRIVER_STATUS;
- cmsg.length = sizeof(blkif_fe_driver_status_t);
- fe_st.status = BLKIF_DRIVER_STATUS_UP;
- memcpy(cmsg.msg, &fe_st, sizeof(fe_st));
- ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+/*
+ * Each outstanding request that we've passed to the lower device layers has a
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements
+ * the pendcnt towards zero. When it hits zero, the specified domain has a
+ * response queued for it, with the saved 'id' passed back.
+ */
+typedef struct {
+ blkif_t *blkif;
+ unsigned long id;
+ int nr_pages;
+ atomic_t pendcnt;
+ unsigned short operation;
+ int status;
+} pending_req_t;
+
+/*
+ * We can't allocate pending_req's in order, since they may complete out of
+ * order. We therefore maintain an allocation ring. This ring also indicates
+ * when enough work has been passed down -- at that point the allocation ring
+ * will be empty.
+ */
+static pending_req_t pending_reqs[MAX_PENDING_REQS];
+static unsigned char pending_ring[MAX_PENDING_REQS];
+static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
+/* NB. We use a different index type to differentiate from shared blk rings. */
+typedef unsigned int PEND_RING_IDX;
+#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
+static PEND_RING_IDX pending_prod, pending_cons;
+#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
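+
+/*
+ * Sketch of the allocation discipline (the real producer/consumer sites
+ * are dispatch_rw_block_io() and blktap_read_ufe_ring() below):
+ *
+ *   idx = pending_ring[MASK_PEND_IDX(pending_cons++)];  // take a free slot
+ *   ... request is in flight ...
+ *   pending_ring[MASK_PEND_IDX(pending_prod++)] = idx;  // return it
+ *
+ * The ring starts full (pending_prod == MAX_PENDING_REQS, pending_cons == 0)
+ * so NR_PENDING_REQS counts the slots currently in flight.
+ */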
+
+/* Requests passing through the tap to the backend hijack the id field
+ * in the request message. In it we put the pending-ring index _AND_ the
+ * fe domid. The domid is used by the backend to map the pages properly.
+ */
- DPRINTK(" tap - Frontend connection init:\n");
+static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
+{
+ return ( (fe_dom << 16) | MASK_PEND_IDX(idx) );
+}
+
+static inline PEND_RING_IDX ID_TO_IDX(unsigned long id)
+{
+ return (PEND_RING_IDX)( id & 0x0000ffff );
+}
+
+static inline domid_t ID_TO_DOM(unsigned long id)
+{
+ return (domid_t)(id >> 16);
+}
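+
+/*
+ * Packing example: for frontend domain 5 and pending-ring index 3,
+ * MAKE_ID(5, 3) == 0x00050003; ID_TO_DOM() recovers 5 and ID_TO_IDX()
+ * recovers 3. This relies on MAX_PENDING_REQS (64) fitting in the low
+ * 16 bits of the id.
+ */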
+
+
+
+/******************************************************************
+ * GRANT HANDLES
+ */
+
+/* When using grant tables to map a frame for device access then the
+ * handle returned must be used to unmap the frame. This is needed to
+ * drop the ref count on the frame.
+ */
+struct grant_handle_pair
+{
+ u16 kernel;
+ u16 user;
+};
+static struct grant_handle_pair pending_grant_handles[MMAP_PAGES];
+#define pending_handle(_idx, _i) \
+ (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
+#define BLKTAP_INVALID_HANDLE(_g) \
+ (((_g->kernel) == 0xFFFF) && ((_g->user) == 0xFFFF))
+#define BLKTAP_INVALIDATE_HANDLE(_g) do { \
+ (_g)->kernel = 0xFFFF; (_g)->user = 0xFFFF; \
+ } while(0)
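+
+/*
+ * Two handles are kept per segment because each data frame is mapped
+ * twice: once into the kernel at mmap_vstart and once into the tap
+ * process at user_vstart. fast_flush_area() below issues one
+ * GNTTABOP_unmap_grant_ref per live handle in the pair.
+ */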
+
+
+/******************************************************************
+ * BLKTAP VM OPS
+ */
+
+static struct page *blktap_nopage(struct vm_area_struct *vma,
+ unsigned long address,
+ int *type)
+{
+ /*
+ * If the page has not been mapped in by the driver then generate
+ * a SIGBUS to the faulting process.
+ */
+
+ force_sig(SIGBUS, current);
+
+ return 0;
+}
+
+struct vm_operations_struct blktap_vm_ops = {
+ nopage: blktap_nopage,
+};
+
+/******************************************************************
+ * BLKTAP FILE OPS
+ */
+
+static int blktap_open(struct inode *inode, struct file *filp)
+{
+ blkif_sring_t *sring;
- active_reqs_init();
- blkif_interface_init();
- blkdev_schedule_init();
+ if ( test_and_set_bit(0, &blktap_dev_inuse) )
+ return -EBUSY;
+
+ /* Allocate the fe ring. */
+ sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
+ if (sring == NULL)
+ goto fail_nomem;
+
+ SetPageReserved(virt_to_page(sring));
- (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx,
- CALLBACK_IN_BLOCKING_CONTEXT);
+ SHARED_RING_INIT(sring);
+ FRONT_RING_INIT(&blktap_ufe_ring, sring, PAGE_SIZE);
+
+ return 0;
+
+ fail_nomem:
+ return -ENOMEM;
+}
+
+static int blktap_release(struct inode *inode, struct file *filp)
+{
+ blktap_dev_inuse = 0;
+ blktap_ring_ok = 0;
+
+ /* Free the ring page. */
+ ClearPageReserved(virt_to_page(blktap_ufe_ring.sring));
+ free_page((unsigned long) blktap_ufe_ring.sring);
+
+ /* Clear any active mappings and free foreign map table */
+ if (blktap_vma != NULL) {
+ zap_page_range(blktap_vma, blktap_vma->vm_start,
+ blktap_vma->vm_end - blktap_vma->vm_start, NULL);
+ blktap_vma = NULL;
+ }
+
+ return 0;
+}
+
+
+/* Note on mmap:
+ * We need to map pages to user space in a way that will allow the block
+ * subsystem to set up direct IO to them. This couldn't be done before, because
+ * there isn't really a sane way to translate a user virtual address down to a
+ * physical address when the page belongs to another domain.
+ *
+ * My first approach was to map the page in to kernel memory, add an entry
+ * for it in the physical frame list (using alloc_lomem_region as in blkback)
+ * and then attempt to map that page up to user space. This is disallowed
+ * by xen though, which realizes that we don't really own the machine frame
+ * underlying the physical page.
+ *
+ * The new approach is to provide explicit support for this in xen linux.
+ * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
+ * mapped from other vms. vma->vm_private_data is set up as a mapping
+ * from pages to actual page structs. There is a new clause in get_user_pages
+ * that does the right thing for this sort of mapping.
+ */
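+
+/*
+ * Minimal sketch of the lookup the VM_FOREIGN clause performs (the hook
+ * itself lives in the modified get_user_pages(), not in this file):
+ *
+ *   struct page **map = vma->vm_private_data;
+ *   struct page *pg = map[(uvaddr - vma->vm_start) >> PAGE_SHIFT];
+ *
+ * One struct page pointer per page of the VMA, filled in by
+ * dispatch_rw_block_io() and cleared again by blktap_read_ufe_ring().
+ */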
+static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ int size;
+ struct page **map;
+ int i;
+
+ DPRINTK(KERN_ALERT "blktap mmap (%lx, %lx)\n",
+ vma->vm_start, vma->vm_end);
+
+ vma->vm_flags |= VM_RESERVED;
+ vma->vm_ops = &blktap_vm_ops;
+
+ size = vma->vm_end - vma->vm_start;
+ if ( size != ( (MMAP_PAGES + RING_PAGES) << PAGE_SHIFT ) ) {
+ printk(KERN_INFO
+ "blktap: you _must_ map exactly %d pages!\n",
+ MMAP_PAGES + RING_PAGES);
+ return -EAGAIN;
+ }
+
+ size >>= PAGE_SHIFT;
+ DPRINTK(KERN_INFO "blktap: 1 ring page + %d data pages.\n", size-1);
+
+ rings_vstart = vma->vm_start;
+ user_vstart = rings_vstart + (RING_PAGES << PAGE_SHIFT);
+
+ /* Map the ring pages to the start of the region and reserve it. */
+
+ /* not sure if I really need to do this... */
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
- /* Send a driver-UP notification to the domain controller. */
- cmsg.type = CMSG_BLKIF_BE;
- cmsg.subtype = CMSG_BLKIF_BE_DRIVER_STATUS;
- cmsg.length = sizeof(blkif_be_driver_status_t);
- be_st.status = BLKIF_DRIVER_STATUS_UP;
- memcpy(cmsg.msg, &be_st, sizeof(be_st));
- ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+ if (remap_pfn_range(vma, vma->vm_start,
+ __pa(blktap_ufe_ring.sring) >> PAGE_SHIFT,
+ PAGE_SIZE, vma->vm_page_prot))
+ {
+ WPRINTK("Mapping user ring failed!\n");
+ goto fail;
+ }
- DPRINTK(" tap - Userland channel init:\n");
+ /* Mark this VM as containing foreign pages, and set up mappings. */
+ map = kmalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
+ * sizeof(struct page *),
+ GFP_KERNEL);
+ if (map == NULL)
+ {
+ WPRINTK("Couldn't alloc VM_FOREIGH map.\n");
+ goto fail;
+ }
- blktap_init();
+ for (i=0; i<((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++)
+ map[i] = NULL;
+
+ vma->vm_private_data = map;
+ vma->vm_flags |= VM_FOREIGN;
- DPRINTK("Blkif tap device initialized.\n");
+ blktap_vma = vma;
+ blktap_ring_ok = 1;
return 0;
+ fail:
+ /* Clear any active mappings. */
+ zap_page_range(vma, vma->vm_start,
+ vma->vm_end - vma->vm_start, NULL);
+
+ return -ENOMEM;
}
-#if 0 /* tap doesn't handle suspend/resume */
-void blkdev_suspend(void)
+static int blktap_ioctl(struct inode *inode, struct file *filp,
+ unsigned int cmd, unsigned long arg)
{
+ switch(cmd) {
+ case BLKTAP_IOCTL_KICK_FE: /* There are fe messages to process. */
+ return blktap_read_ufe_ring();
+
+ case BLKTAP_IOCTL_SETMODE:
+ if (BLKTAP_MODE_VALID(arg)) {
+ blktap_mode = arg;
+ /* XXX: may need to flush rings here. */
+ printk(KERN_INFO "blktap: set mode to %lx\n", arg);
+ return 0;
+ }
+ return -EINVAL; /* reject bad modes; don't fall through */
+
+ case BLKTAP_IOCTL_PRINT_IDXS:
+ {
+ WPRINTK("User Rings:\n-----------\n");
+ WPRINTK("UF: rsp_cons: %2d, req_prod_pvt: %2d "
+ "| req_prod: %2d, rsp_prod: %2d\n",
+ blktap_ufe_ring.rsp_cons,
+ blktap_ufe_ring.req_prod_pvt,
+ blktap_ufe_ring.sring->req_prod,
+ blktap_ufe_ring.sring->rsp_prod);
+ return 0;
+ }
+ }
+ return -ENOIOCTLCMD;
+}
+
+static unsigned int blktap_poll(struct file *file, poll_table *wait)
+{
+ poll_wait(file, &blktap_wait, wait);
+ if ( RING_HAS_UNPUSHED_REQUESTS(&blktap_ufe_ring) )
+ {
+ flush_tlb_all();
+
+ RING_PUSH_REQUESTS(&blktap_ufe_ring);
+ return POLLIN | POLLRDNORM;
+ }
+
+ return 0;
}
-void blkdev_resume(void)
+void blktap_kick_user(void)
{
- ctrl_msg_t cmsg;
- blkif_fe_driver_status_t st;
+ /* blktap_ring->req_prod = blktap_req_prod; */
+ wake_up_interruptible(&blktap_wait);
+}
+
+static struct file_operations blktap_fops = {
+ owner: THIS_MODULE,
+ poll: blktap_poll,
+ ioctl: blktap_ioctl,
+ open: blktap_open,
+ release: blktap_release,
+ mmap: blktap_mmap,
+};
+
+
+
+static int do_block_io_op(blkif_t *blkif, int max_to_do);
+static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
+static void make_response(blkif_t *blkif, unsigned long id,
+ unsigned short op, int st);
+
+
+static void fast_flush_area(int idx, int nr_pages)
+{
+ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
+ unsigned int i, op = 0;
+ struct grant_handle_pair *handle;
+ unsigned long ptep;
+
+ for (i=0; i<nr_pages; i++)
+ {
+ handle = &pending_handle(idx, i);
+ if (!BLKTAP_INVALID_HANDLE(handle))
+ {
+
+ unmap[op].host_addr = MMAP_VADDR(mmap_vstart, idx, i);
+ unmap[op].dev_bus_addr = 0;
+ unmap[op].handle = handle->kernel;
+ op++;
+
+ if (create_lookup_pte_addr(blktap_vma->vm_mm,
+ MMAP_VADDR(user_vstart, idx, i),
+ &ptep) !=0) {
+ DPRINTK("Couldn't get a pte addr!\n");
+ return;
+ }
+ unmap[op].host_addr = ptep;
+ unmap[op].dev_bus_addr = 0;
+ unmap[op].handle = handle->user;
+ op++;
+
+ BLKTAP_INVALIDATE_HANDLE(handle);
+ }
+ }
+ if ( unlikely(HYPERVISOR_grant_table_op(
+ GNTTABOP_unmap_grant_ref, unmap, op)))
+ BUG();
+
+ if (blktap_vma != NULL)
+ zap_page_range(blktap_vma,
+ MMAP_VADDR(user_vstart, idx, 0),
+ nr_pages << PAGE_SHIFT, NULL);
+}
+
+/******************************************************************
+ * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
+ */
+
+static struct list_head blkio_schedule_list;
+static spinlock_t blkio_schedule_list_lock;
+
+static int __on_blkdev_list(blkif_t *blkif)
+{
+ return blkif->blkdev_list.next != NULL;
+}
- /* Send a driver-UP notification to the domain controller. */
- cmsg.type = CMSG_BLKIF_FE;
- cmsg.subtype = CMSG_BLKIF_FE_DRIVER_STATUS;
- cmsg.length = sizeof(blkif_fe_driver_status_t);
- st.status = BLKIF_DRIVER_STATUS_UP;
- memcpy(cmsg.msg, &st, sizeof(st));
- ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+static void remove_from_blkdev_list(blkif_t *blkif)
+{
+ unsigned long flags;
+ if ( !__on_blkdev_list(blkif) ) return;
+ spin_lock_irqsave(&blkio_schedule_list_lock, flags);
+ if ( __on_blkdev_list(blkif) )
+ {
+ list_del(&blkif->blkdev_list);
+ blkif->blkdev_list.next = NULL;
+ blkif_put(blkif);
+ }
+ spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
+}
+
+static void add_to_blkdev_list_tail(blkif_t *blkif)
+{
+ unsigned long flags;
+ if ( __on_blkdev_list(blkif) ) return;
+ spin_lock_irqsave(&blkio_schedule_list_lock, flags);
+ if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) )
+ {
+ list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
+ blkif_get(blkif);
+ }
+ spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
+}
+
+
+/******************************************************************
+ * SCHEDULER FUNCTIONS
+ */
+
+static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);
+
+static int blkio_schedule(void *arg)
+{
+ DECLARE_WAITQUEUE(wq, current);
+
+ blkif_t *blkif;
+ struct list_head *ent;
+
+ daemonize("xenblkd");
+
+ for ( ; ; )
+ {
+ /* Wait for work to do. */
+ add_wait_queue(&blkio_schedule_wait, &wq);
+ set_current_state(TASK_INTERRUPTIBLE);
+ if ( (NR_PENDING_REQS == MAX_PENDING_REQS) ||
+ list_empty(&blkio_schedule_list) )
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&blkio_schedule_wait, &wq);
+
+ /* Queue up a batch of requests. */
+ while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
+ !list_empty(&blkio_schedule_list) )
+ {
+ ent = blkio_schedule_list.next;
+ blkif = list_entry(ent, blkif_t, blkdev_list);
+ blkif_get(blkif);
+ remove_from_blkdev_list(blkif);
+ if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) )
+ add_to_blkdev_list_tail(blkif);
+ blkif_put(blkif);
+ }
+ }
+}
+
+static void maybe_trigger_blkio_schedule(void)
+{
+ /*
+ * Needed so that two processes, who together make the following predicate
+ * true, don't both read stale values and evaluate the predicate
+ * incorrectly. Incredibly unlikely to stall the scheduler on x86, but...
+ */
+ smp_mb();
+
+ if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
+ !list_empty(&blkio_schedule_list) )
+ wake_up(&blkio_schedule_wait);
+}
+
+
+
+/******************************************************************
+ * COMPLETION -- reads responses off the user ring.
+ * (Called via the BLKTAP_IOCTL_KICK_FE ioctl, not as bh->b_end_io().)
+ */
+
+
+static int blktap_read_ufe_ring(void)
+{
+ /* This is called to read responses from the UFE ring. */
+
+ RING_IDX i, j, rp;
+ blkif_response_t *resp;
+ blkif_t *blkif;
+ int pending_idx;
+ pending_req_t *pending_req;
+ unsigned long flags;
+
+ /* If we are forwarding responses from the UFE ring to the FE ring: */
+ if (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) {
+
+ /* for each outstanding message on the UFEring */
+ rp = blktap_ufe_ring.sring->rsp_prod;
+ rmb();
+
+ for ( i = blktap_ufe_ring.rsp_cons; i != rp; i++ )
+ {
+ resp = RING_GET_RESPONSE(&blktap_ufe_ring, i);
+ pending_idx = MASK_PEND_IDX(ID_TO_IDX(resp->id));
+ pending_req = &pending_reqs[pending_idx];
+
+ blkif = pending_req->blkif;
+ for (j = 0; j < pending_req->nr_pages; j++) {
+ unsigned long vaddr;
+ struct page **map = blktap_vma->vm_private_data;
+ int offset;
+
+ vaddr = MMAP_VADDR(user_vstart, pending_idx, j);
+ offset = (vaddr - blktap_vma->vm_start) >> PAGE_SHIFT;
+
+ //ClearPageReserved(virt_to_page(vaddr));
+ ClearPageReserved((struct page *)map[offset]);
+ map[offset] = NULL;
+ }
+
+ fast_flush_area(pending_idx, pending_req->nr_pages);
+ make_response(blkif, pending_req->id, resp->operation,
+ resp->status);
+ blkif_put(pending_req->blkif);
+ spin_lock_irqsave(&pend_prod_lock, flags);
+ pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+ spin_unlock_irqrestore(&pend_prod_lock, flags);
+ }
+ blktap_ufe_ring.rsp_cons = i;
+ maybe_trigger_blkio_schedule();
+ }
+ return 0;
+}
+
+
+/******************************************************************************
+ * NOTIFICATION FROM GUEST OS.
+ */
+
+irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
+{
+ blkif_t *blkif = dev_id;
+ add_to_blkdev_list_tail(blkif);
+ maybe_trigger_blkio_schedule();
+ return IRQ_HANDLED;
+}
+
+
+
+/******************************************************************
+ * DOWNWARD CALLS -- These interface with the block-device layer proper.
+ */
+
+static int do_block_io_op(blkif_t *blkif, int max_to_do)
+{
+ blkif_back_ring_t *blk_ring = &blkif->blk_ring;
+ blkif_request_t *req;
+ RING_IDX i, rp;
+ int more_to_do = 0;
+
+ rp = blk_ring->sring->req_prod;
+ rmb(); /* Ensure we see queued requests up to 'rp'. */
+
+ for ( i = blk_ring->req_cons;
+ (i != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, i);
+ i++ )
+ {
+ if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) )
+ {
+ more_to_do = 1;
+ break;
+ }
+
+ req = RING_GET_REQUEST(blk_ring, i);
+ switch ( req->operation )
+ {
+ case BLKIF_OP_READ:
+ case BLKIF_OP_WRITE:
+ dispatch_rw_block_io(blkif, req);
+ break;
+
+ default:
+ DPRINTK("error: unknown block io operation [%d]\n",
+ req->operation);
+ make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+ break;
+ }
+ }
+
+ blk_ring->req_cons = i;
+ blktap_kick_user();
+
+ return more_to_do;
+}
+
+static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
+{
+ blkif_request_t *target;
+ int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
+ pending_req_t *pending_req;
+ struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
+ int op, ret;
+ unsigned int nseg;
+
+ /* Check that number of segments is sane. */
+ nseg = req->nr_segments;
+ if ( unlikely(nseg == 0) ||
+ unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) )
+ {
+ DPRINTK("Bad number of segments in request (%d)\n", nseg);
+ goto bad_descriptor;
+ }
+
+ /* Make sure userspace is ready. */
+ if (!blktap_ring_ok) {
+ DPRINTK("blktap: ring not ready for requests!\n");
+ goto bad_descriptor;
+ }
+
+
+ if ( RING_FULL(&blktap_ufe_ring) ) {
+ WPRINTK("blktap: fe_ring is full, can't add (very broken!).\n");
+ goto bad_descriptor;
+ }
+
+ flush_cache_all(); /* a noop on intel... */
+
+ /* Map the foreign pages directly into the application's address space. */
+ op = 0;
+ for (i=0; i<req->nr_segments; i++) {
+
+ unsigned long uvaddr;
+ unsigned long kvaddr;
+ unsigned long ptep;
+
+ uvaddr = MMAP_VADDR(user_vstart, pending_idx, i);
+ kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i);
+
+ /* Map the remote page to kernel. */
+ map[op].host_addr = kvaddr;
+ map[op].dom = blkif->domid;
+ map[op].ref = blkif_gref_from_fas(req->frame_and_sects[i]);
+ map[op].flags = GNTMAP_host_map;
+ /* This needs a bit more thought in terms of interposition:
+ * If we want to be able to modify pages during write using
+ * grant table mappings, the guest will either need to allow
+ * it, or we'll need to incur a copy. Bit of an fbufs moment. ;) */
+ if (req->operation == BLKIF_OP_WRITE)
+ map[op].flags |= GNTMAP_readonly;
+ op++;
+
+ /* Now map it to user. */
+ ret = create_lookup_pte_addr(blktap_vma->vm_mm, uvaddr, &ptep);
+ if (ret)
+ {
+ DPRINTK("Couldn't get a pte addr!\n");
+ fast_flush_area(pending_idx, req->nr_segments);
+ goto bad_descriptor;
+ }
+
+ map[op].host_addr = ptep;
+ map[op].dom = blkif->domid;
+ map[op].ref = blkif_gref_from_fas(req->frame_and_sects[i]);
+ map[op].flags = GNTMAP_host_map | GNTMAP_application_map
+ | GNTMAP_contains_pte;
+ /* Above interposition comment applies here as well. */
+ if (req->operation == BLKIF_OP_WRITE)
+ map[op].flags |= GNTMAP_readonly;
+ op++;
+ }
+
+ if ( unlikely(HYPERVISOR_grant_table_op(
+ GNTTABOP_map_grant_ref, map, op)))
+ BUG();
+
+ op = 0;
+ for (i=0; i<(req->nr_segments*2); i+=2) {
+ unsigned long uvaddr;
+ unsigned long kvaddr;
+ unsigned long offset;
+ int cancel = 0;
+
+ uvaddr = MMAP_VADDR(user_vstart, pending_idx, i/2);
+ kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i/2);
+
+ if ( unlikely(map[i].handle < 0) )
+ {
+ DPRINTK("Error on kernel grant mapping (%d)\n", map[i].handle);
+ ret = map[i].handle;
+ cancel = 1;
+ }
+
+ if ( unlikely(map[i+1].handle < 0) )
+ {
+ DPRINTK("Error on user grant mapping (%d)\n", map[i+1].handle);
+ ret = map[i+1].handle;
+ cancel = 1;
+ }
+
+ if (cancel)
+ {
+ fast_flush_area(pending_idx, req->nr_segments);
+ goto bad_descriptor;
+ }
+
+ /* Set the necessary mappings in p2m and in the VM_FOREIGN
+ * vm_area_struct to allow user vaddr -> struct page lookups
+ * to work. This is needed for direct IO to foreign pages. */
+ phys_to_machine_mapping[__pa(kvaddr) >> PAGE_SHIFT] =
+ FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT);
+
+ offset = (uvaddr - blktap_vma->vm_start) >> PAGE_SHIFT;
+ ((struct page **)blktap_vma->vm_private_data)[offset] =
+ pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
+
+ /* Save handles for unmapping later. */
+ pending_handle(pending_idx, i/2).kernel = map[i].handle;
+ pending_handle(pending_idx, i/2).user = map[i+1].handle;
+ }
+
+ /* Mark mapped pages as reserved: */
+ for ( i = 0; i < req->nr_segments; i++ )
+ {
+ unsigned long kvaddr;
+
+ kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i);
+ SetPageReserved(pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT));
+ }
+
+ pending_req = &pending_reqs[pending_idx];
+ pending_req->blkif = blkif;
+ pending_req->id = req->id;
+ pending_req->operation = req->operation;
+ pending_req->status = BLKIF_RSP_OKAY;
+ pending_req->nr_pages = nseg;
+ req->id = MAKE_ID(blkif->domid, pending_idx);
+ //atomic_set(&pending_req->pendcnt, nbio);
+ pending_cons++;
+ blkif_get(blkif);
+
+ /* Finally, write the request message to the user ring. */
+ target = RING_GET_REQUEST(&blktap_ufe_ring, blktap_ufe_ring.req_prod_pvt);
+ memcpy(target, req, sizeof(*req));
+ blktap_ufe_ring.req_prod_pvt++;
+ return;
+
+ bad_descriptor:
+ make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+}
+
+
+
+/******************************************************************
+ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
+ */
+
+
+static void make_response(blkif_t *blkif, unsigned long id,
+ unsigned short op, int st)
+{
+ blkif_response_t *resp;
+ unsigned long flags;
+ blkif_back_ring_t *blk_ring = &blkif->blk_ring;
+
+ /* Place on the response ring for the relevant domain. */
+ spin_lock_irqsave(&blkif->blk_ring_lock, flags);
+ resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
+ resp->id = id;
+ resp->operation = op;
+ resp->status = st;
+ wmb(); /* Ensure other side can see the response fields. */
+ blk_ring->rsp_prod_pvt++;
+ RING_PUSH_RESPONSES(blk_ring);
+ spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
+
+ /* Kick the relevant domain. */
+ notify_via_evtchn(blkif->evtchn);
+}
+
+static struct miscdevice blktap_miscdev = {
+ .minor = BLKTAP_MINOR,
+ .name = "blktap",
+ .fops = &blktap_fops,
+ .devfs_name = "misc/blktap",
+};
+
+void blkif_deschedule(blkif_t *blkif)
+{
+ remove_from_blkdev_list(blkif);
+}
+
+static int __init blkif_init(void)
+{
+ int i, j, err;
+ struct page *page;
+/*
+ if ( !(xen_start_info.flags & SIF_INITDOMAIN) &&
+ !(xen_start_info.flags & SIF_BLK_BE_DOMAIN) )
+ return 0;
+*/
+ blkif_interface_init();
+
+ page = balloon_alloc_empty_page_range(MMAP_PAGES);
+ BUG_ON(page == NULL);
+ mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
+
+ pending_cons = 0;
+ pending_prod = MAX_PENDING_REQS;
+ memset(pending_reqs, 0, sizeof(pending_reqs));
+ for ( i = 0; i < MAX_PENDING_REQS; i++ )
+ pending_ring[i] = i;
+
+ spin_lock_init(&blkio_schedule_list_lock);
+ INIT_LIST_HEAD(&blkio_schedule_list);
+
+ if ( kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES) < 0 )
+ BUG();
+
+ blkif_xenbus_init();
+
+ for (i=0; i<MAX_PENDING_REQS ; i++)
+ for (j=0; j<BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
+ BLKTAP_INVALIDATE_HANDLE(&pending_handle(i, j));
+
+ err = misc_register(&blktap_miscdev);
+ if ( err != 0 )
+ {
+ printk(KERN_ALERT "Couldn't register /dev/misc/blktap (%d)\n", err);
+ return err;
+ }
+
+ init_waitqueue_head(&blktap_wait);
+
+ return 0;
}
-#endif
-__initcall(xlblktap_init);
+__initcall(blkif_init);
+++ /dev/null
-/******************************************************************************
- * blktap_controlmsg.c
- *
- * XenLinux virtual block-device tap.
- * Control interfaces to the frontend and backend drivers.
- *
- * Copyright (c) 2004, Andrew Warfield
- *
- */
-
-#include "blktap.h"
-#include <asm-xen/evtchn.h>
-
-static char *blkif_state_name[] = {
- [BLKIF_STATE_CLOSED] = "closed",
- [BLKIF_STATE_DISCONNECTED] = "disconnected",
- [BLKIF_STATE_CONNECTED] = "connected",
-};
-
-static char *blkif_status_name[] = {
- [BLKIF_INTERFACE_STATUS_CLOSED] = "closed",
- [BLKIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected",
- [BLKIF_INTERFACE_STATUS_CONNECTED] = "connected",
- [BLKIF_INTERFACE_STATUS_CHANGED] = "changed",
-};
-
-unsigned int blktap_be_state = BLKIF_STATE_CLOSED;
-unsigned int blktap_be_evtchn;
-
-/*-----[ Control Messages to/from Frontend VMs ]--------------------------*/
-
-#define BLKIF_HASHSZ 1024
-#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1))
-
-static kmem_cache_t *blkif_cachep;
-static blkif_t *blkif_hash[BLKIF_HASHSZ];
-
-blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle)
-{
- blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)];
- while ( (blkif != NULL) &&
- ((blkif->domid != domid) || (blkif->handle != handle)) )
- blkif = blkif->hash_next;
- return blkif;
-}
-
-static void __blkif_disconnect_complete(void *arg)
-{
- blkif_t *blkif = (blkif_t *)arg;
- ctrl_msg_t cmsg;
- blkif_be_disconnect_t disc;
-#ifdef CONFIG_XEN_BLKDEV_GRANT
- struct gnttab_unmap_grant_ref op;
-#endif
-
- /*
- * These can't be done in blkif_disconnect() because at that point there
- * may be outstanding requests at the disc whose asynchronous responses
- * must still be notified to the remote driver.
- */
-#ifdef CONFIG_XEN_BLKDEV_GRANT
- op.host_addr = blkif->shmem_vaddr;
- op.handle = blkif->shmem_handle;
- op.dev_bus_addr = 0;
- BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1));
-#endif
- vfree(blkif->blk_ring.sring);
-
- /* Construct the deferred response message. */
- cmsg.type = CMSG_BLKIF_BE;
- cmsg.subtype = CMSG_BLKIF_BE_DISCONNECT;
- cmsg.id = blkif->disconnect_rspid;
- cmsg.length = sizeof(blkif_be_disconnect_t);
- disc.domid = blkif->domid;
- disc.blkif_handle = blkif->handle;
- disc.status = BLKIF_BE_STATUS_OKAY;
- memcpy(cmsg.msg, &disc, sizeof(disc));
-
- /*
- * Make sure message is constructed /before/ status change, because
- * after the status change the 'blkif' structure could be deallocated at
- * any time. Also make sure we send the response /after/ status change,
- * as otherwise a subsequent CONNECT request could spuriously fail if
- * another CPU doesn't see the status change yet.
- */
- mb();
- if ( blkif->status != DISCONNECTING )
- BUG();
- blkif->status = DISCONNECTED;
- mb();
-
- /* Send the successful response. */
- ctrl_if_send_response(&cmsg);
-}
-
-void blkif_disconnect_complete(blkif_t *blkif)
-{
- INIT_WORK(&blkif->work, __blkif_disconnect_complete, (void *)blkif);
- schedule_work(&blkif->work);
-}
-
-void blkif_ptfe_create(blkif_be_create_t *create)
-{
- blkif_t *blkif, **pblkif;
- domid_t domid = create->domid;
- unsigned int handle = create->blkif_handle;
-
-
- /* May want to store info on the connecting domain here. */
-
- DPRINTK("PT got BE_CREATE\n");
-
- if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL)) == NULL )
- {
- WPRINTK("Could not create blkif: out of memory\n");
- create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
- return;
- }
-
- /* blkif struct init code from blkback.c */
- memset(blkif, 0, sizeof(*blkif));
- blkif->domid = domid;
- blkif->handle = handle;
- blkif->status = DISCONNECTED;
- spin_lock_init(&blkif->blk_ring_lock);
- atomic_set(&blkif->refcnt, 0);
-
- pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
- while ( *pblkif != NULL )
- {
- if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) )
- {
- WPRINTK("Could not create blkif: already exists\n");
- create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS;
- kmem_cache_free(blkif_cachep, blkif);
- return;
- }
- pblkif = &(*pblkif)->hash_next;
- }
-
- blkif->hash_next = *pblkif;
- *pblkif = blkif;
-
- create->status = BLKIF_BE_STATUS_OKAY;
-}
-
-
-void blkif_ptfe_destroy(blkif_be_destroy_t *destroy)
-{
- /* Clear anything that we initialized above. */
-
- domid_t domid = destroy->domid;
- unsigned int handle = destroy->blkif_handle;
- blkif_t **pblkif, *blkif;
-
- DPRINTK("PT got BE_DESTROY\n");
-
- pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
- while ( (blkif = *pblkif) != NULL )
- {
- if ( (blkif->domid == domid) && (blkif->handle == handle) )
- {
- if ( blkif->status != DISCONNECTED )
- goto still_connected;
- goto destroy;
- }
- pblkif = &blkif->hash_next;
- }
-
- destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
- return;
-
- still_connected:
- destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
- return;
-
- destroy:
- *pblkif = blkif->hash_next;
- kmem_cache_free(blkif_cachep, blkif);
- destroy->status = BLKIF_BE_STATUS_OKAY;
-}
-
-void blkif_ptfe_connect(blkif_be_connect_t *connect)
-{
- domid_t domid = connect->domid;
- unsigned int handle = connect->blkif_handle;
- unsigned int evtchn = connect->evtchn;
- unsigned long shmem_frame = connect->shmem_frame;
- struct vm_struct *vma;
-#ifdef CONFIG_XEN_BLKDEV_GRANT
- int ref = connect->shmem_ref;
-#else
- pgprot_t prot;
- int error;
-#endif
- blkif_t *blkif;
- blkif_sring_t *sring;
-
- DPRINTK("PT got BE_CONNECT\n");
-
- blkif = blkif_find_by_handle(domid, handle);
- if ( unlikely(blkif == NULL) )
- {
- WPRINTK("blkif_connect attempted for non-existent blkif (%u,%u)\n",
- connect->domid, connect->blkif_handle);
- connect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
- return;
- }
-
- if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL )
- {
- connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
- return;
- }
-
-#ifndef CONFIG_XEN_BLKDEV_GRANT
- prot = __pgprot(_KERNPG_TABLE);
- error = direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(vma->addr),
- shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
- prot, domid);
- if ( error != 0 )
- {
- if ( error == -ENOMEM )
- connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
- else if ( error == -EFAULT )
- connect->status = BLKIF_BE_STATUS_MAPPING_ERROR;
- else
- connect->status = BLKIF_BE_STATUS_ERROR;
- vfree(vma->addr);
- return;
- }
-#else
- { /* Map: Use the Grant table reference */
- struct gnttab_map_grant_ref op;
- op.host_addr = VMALLOC_VMADDR(vma->addr);
- op.flags = GNTMAP_host_map;
- op.ref = ref;
- op.dom = domid;
-
- BUG_ON( HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1) );
-
- handle = op.handle;
-
- if (op.handle < 0) {
- DPRINTK(" Grant table operation failure !\n");
- connect->status = BLKIF_BE_STATUS_MAPPING_ERROR;
- vfree(vma->addr);
- return;
- }
-
- blkif->shmem_ref = ref;
- blkif->shmem_handle = handle;
- blkif->shmem_vaddr = VMALLOC_VMADDR(vma->addr);
- }
-#endif
-
- if ( blkif->status != DISCONNECTED )
- {
- connect->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
- vfree(vma->addr);
- return;
- }
-
- sring = (blkif_sring_t *)vma->addr;
- SHARED_RING_INIT(sring);
- BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE);
-
- blkif->evtchn = evtchn;
- blkif->shmem_frame = shmem_frame;
- blkif->status = CONNECTED;
- blkif_get(blkif);
-
- bind_evtchn_to_irqhandler(
- evtchn, blkif_ptfe_int, 0, "blkif-pt-backend", blkif);
-
- connect->status = BLKIF_BE_STATUS_OKAY;
-}
-
-int blkif_ptfe_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id)
-{
- domid_t domid = disconnect->domid;
- unsigned int handle = disconnect->blkif_handle;
- blkif_t *blkif;
-
- DPRINTK("PT got BE_DISCONNECT\n");
-
- blkif = blkif_find_by_handle(domid, handle);
- if ( unlikely(blkif == NULL) )
- {
- WPRINTK("blkif_disconnect attempted for non-existent blkif"
- " (%u,%u)\n", disconnect->domid, disconnect->blkif_handle);
- disconnect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
- return 1; /* Caller will send response error message. */
- }
-
- if ( blkif->status == CONNECTED )
- {
- blkif->status = DISCONNECTING;
- blkif->disconnect_rspid = rsp_id;
- wmb(); /* Let other CPUs see the status change. */
- unbind_evtchn_from_irqhandler(blkif->evtchn, blkif);
- blkif_deschedule(blkif);
- blkif_put(blkif);
- return 0; /* Caller should not send response message. */
- }
-
- disconnect->status = BLKIF_BE_STATUS_OKAY;
- return 1;
-}
-
-/*-----[ Control Messages to/from Backend VM ]----------------------------*/
-
-/* Tell the controller to bring up the interface. */
-static void blkif_ptbe_send_interface_connect(void)
-{
- ctrl_msg_t cmsg = {
- .type = CMSG_BLKIF_FE,
- .subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT,
- .length = sizeof(blkif_fe_interface_connect_t),
- };
- blkif_fe_interface_connect_t *msg = (void*)cmsg.msg;
- msg->handle = 0;
- msg->shmem_frame = virt_to_mfn(blktap_be_ring.sring);
-
- ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
-}
-
-static void blkif_ptbe_close(void)
-{
-}
-
-/* Move from CLOSED to DISCONNECTED state. */
-static void blkif_ptbe_disconnect(void)
-{
- blkif_sring_t *sring;
-
- sring = (blkif_sring_t *)__get_free_page(GFP_KERNEL);
- SHARED_RING_INIT(sring);
- FRONT_RING_INIT(&blktap_be_ring, sring, PAGE_SIZE);
- blktap_be_state = BLKIF_STATE_DISCONNECTED;
- DPRINTK("Blkif-Passthrough-BE is now DISCONNECTED.\n");
- blkif_ptbe_send_interface_connect();
-}
-
-static void blkif_ptbe_connect(blkif_fe_interface_status_t *status)
-{
- int err = 0;
-
- blktap_be_evtchn = status->evtchn;
-
- err = bind_evtchn_to_irqhandler(
- blktap_be_evtchn, blkif_ptbe_int, SA_SAMPLE_RANDOM, "blkif", NULL);
- if ( err ) {
- WPRINTK("blkfront bind_evtchn_to_irqhandler failed (%d)\n", err);
- return;
- } else {
- /* transtion to connected in case we need to do a
- a partion probe on a whole disk */
- blktap_be_state = BLKIF_STATE_CONNECTED;
- }
-}
-
-static void unexpected(blkif_fe_interface_status_t *status)
-{
- WPRINTK(" TAP: Unexpected blkif status %s in state %s\n",
- blkif_status_name[status->status],
- blkif_state_name[blktap_be_state]);
-}
-
-static void blkif_ptbe_status(
- blkif_fe_interface_status_t *status)
-{
- if ( status->handle != 0 )
- {
- DPRINTK("Status change on unsupported blkif %d\n",
- status->handle);
- return;
- }
-
- DPRINTK("ptbe_status: got %s\n", blkif_status_name[status->status]);
-
- switch ( status->status )
- {
- case BLKIF_INTERFACE_STATUS_CLOSED:
- switch ( blktap_be_state )
- {
- case BLKIF_STATE_CLOSED:
- unexpected(status);
- break;
- case BLKIF_STATE_DISCONNECTED:
- case BLKIF_STATE_CONNECTED:
- unexpected(status);
- blkif_ptbe_close();
- break;
- }
- break;
-
- case BLKIF_INTERFACE_STATUS_DISCONNECTED:
- switch ( blktap_be_state )
- {
- case BLKIF_STATE_CLOSED:
- blkif_ptbe_disconnect();
- break;
- case BLKIF_STATE_DISCONNECTED:
- case BLKIF_STATE_CONNECTED:
- printk(KERN_ALERT "*** add recovery code to the tap driver. ***\n");
- unexpected(status);
- break;
- }
- break;
-
- case BLKIF_INTERFACE_STATUS_CONNECTED:
- switch ( blktap_be_state )
- {
- case BLKIF_STATE_CLOSED:
- unexpected(status);
- blkif_ptbe_disconnect();
- blkif_ptbe_connect(status);
- break;
- case BLKIF_STATE_DISCONNECTED:
- blkif_ptbe_connect(status);
- break;
- case BLKIF_STATE_CONNECTED:
- unexpected(status);
- blkif_ptbe_connect(status);
- break;
- }
- break;
-
- case BLKIF_INTERFACE_STATUS_CHANGED:
- switch ( blktap_be_state )
- {
- case BLKIF_STATE_CLOSED:
- case BLKIF_STATE_DISCONNECTED:
- unexpected(status);
- break;
- case BLKIF_STATE_CONNECTED:
- /* vbd_update(); */
- /* tap doesn't really get state changes... */
- unexpected(status);
- break;
- }
- break;
-
- default:
- DPRINTK("Status change to unknown value %d\n", status->status);
- break;
- }
-}
-
-/*-----[ All control messages enter here: ]-------------------------------*/
-
-void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
-{
- switch ( msg->type )
- {
- case CMSG_BLKIF_FE:
-
- switch ( msg->subtype )
- {
- case CMSG_BLKIF_FE_INTERFACE_STATUS:
- blkif_ptbe_status((blkif_fe_interface_status_t *) &msg->msg[0]);
- break;
-
- default:
- goto parse_error;
- }
-
- break;
-
- case CMSG_BLKIF_BE:
-
- /* send a copy of the message to user if wanted */
-
- if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) ||
- (blktap_mode & BLKTAP_MODE_COPY_FE) ) {
-
- blktap_write_ctrl_ring(msg);
- blktap_kick_user();
- }
-
- switch ( msg->subtype )
- {
- case CMSG_BLKIF_BE_CREATE:
- blkif_ptfe_create((blkif_be_create_t *)&msg->msg[0]);
- break;
- case CMSG_BLKIF_BE_DESTROY:
- blkif_ptfe_destroy((blkif_be_destroy_t *)&msg->msg[0]);
- break;
- case CMSG_BLKIF_BE_CONNECT:
- blkif_ptfe_connect((blkif_be_connect_t *)&msg->msg[0]);
- break;
- case CMSG_BLKIF_BE_DISCONNECT:
- if ( !blkif_ptfe_disconnect((blkif_be_disconnect_t *)&msg->msg[0],
- msg->id) )
- return;
- break;
-
- /* We just ignore anything to do with vbds for now. */
-
- case CMSG_BLKIF_BE_VBD_CREATE:
- DPRINTK("PT got VBD_CREATE\n");
- ((blkif_be_vbd_create_t *)&msg->msg[0])->status
- = BLKIF_BE_STATUS_OKAY;
- break;
- case CMSG_BLKIF_BE_VBD_DESTROY:
- DPRINTK("PT got VBD_DESTROY\n");
- ((blkif_be_vbd_destroy_t *)&msg->msg[0])->status
- = BLKIF_BE_STATUS_OKAY;
- break;
- default:
- goto parse_error;
- }
-
- break;
- }
-
- ctrl_if_send_response(msg);
- return;
-
- parse_error:
- msg->length = 0;
- ctrl_if_send_response(msg);
-}
-
-/*-----[ Initialization ]-------------------------------------------------*/
-
-void __init blkif_interface_init(void)
-{
- blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t),
- 0, 0, NULL, NULL);
- memset(blkif_hash, 0, sizeof(blkif_hash));
-
- blktap_be_ring.sring = NULL;
-}
-
-
-
-/* Debug : print the current ring indices. */
-
-void print_fe_ring_idxs(void)
-{
- int i;
- blkif_t *blkif;
-
- WPRINTK("FE Rings: \n---------\n");
- for ( i = 0; i < BLKIF_HASHSZ; i++) {
- blkif = blkif_hash[i];
- while (blkif != NULL) {
- if (blkif->status == DISCONNECTED) {
- WPRINTK("(%2d,%2d) DISCONNECTED\n",
- blkif->domid, blkif->handle);
- } else if (blkif->status == DISCONNECTING) {
- WPRINTK("(%2d,%2d) DISCONNECTING\n",
- blkif->domid, blkif->handle);
- } else if (blkif->blk_ring.sring == NULL) {
- WPRINTK("(%2d,%2d) CONNECTED, but null sring!\n",
- blkif->domid, blkif->handle);
- } else {
- blkif_get(blkif);
- WPRINTK("(%2d,%2d): req_cons: %2d, rsp_prod_prv: %2d "
- "| req_prod: %2d, rsp_prod: %2d\n",
- blkif->domid, blkif->handle,
- blkif->blk_ring.req_cons,
- blkif->blk_ring.rsp_prod_pvt,
- blkif->blk_ring.sring->req_prod,
- blkif->blk_ring.sring->rsp_prod);
- blkif_put(blkif);
- }
- blkif = blkif->hash_next;
- }
- }
-}
+++ /dev/null
-/******************************************************************************
- * blktap_datapath.c
- *
- * XenLinux virtual block-device tap.
- * Block request routing data path.
- *
- * Copyright (c) 2004, Andrew Warfield
- * -- see full header in blktap.c
- */
-
-#include "blktap.h"
-#include <asm-xen/evtchn.h>
-
-/*-----[ The data paths ]-------------------------------------------------*/
-
-/* Connection to a single backend domain. */
-blkif_front_ring_t blktap_be_ring;
-
-/*-----[ Tracking active requests ]---------------------------------------*/
-
-/* this must be the same as MAX_PENDING_REQS in blkback.c */
-#define MAX_ACTIVE_REQS ((ACTIVE_RING_IDX)64U)
-
-active_req_t active_reqs[MAX_ACTIVE_REQS];
-ACTIVE_RING_IDX active_req_ring[MAX_ACTIVE_REQS];
-spinlock_t active_req_lock = SPIN_LOCK_UNLOCKED;
-ACTIVE_RING_IDX active_prod, active_cons;
-#define MASK_ACTIVE_IDX(_i) ((_i)&(MAX_ACTIVE_REQS-1))
-#define ACTIVE_IDX(_ar) (_ar - active_reqs)
-#define NR_ACTIVE_REQS (MAX_ACTIVE_REQS - active_prod + active_cons)
-
-inline active_req_t *get_active_req(void)
-{
- ACTIVE_RING_IDX idx;
- active_req_t *ar;
- unsigned long flags;
-
- ASSERT(active_cons != active_prod);
-
- spin_lock_irqsave(&active_req_lock, flags);
- idx = active_req_ring[MASK_ACTIVE_IDX(active_cons++)];
- ar = &active_reqs[idx];
- spin_unlock_irqrestore(&active_req_lock, flags);
-
- return ar;
-}
-
-inline void free_active_req(active_req_t *ar)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&active_req_lock, flags);
- active_req_ring[MASK_ACTIVE_IDX(active_prod++)] = ACTIVE_IDX(ar);
- spin_unlock_irqrestore(&active_req_lock, flags);
-}
-
-active_req_t *lookup_active_req(ACTIVE_RING_IDX idx)
-{
- return &active_reqs[idx];
-}
-
-void active_reqs_init(void)
-{
- ACTIVE_RING_IDX i;
-
- active_cons = 0;
- active_prod = MAX_ACTIVE_REQS;
- memset(active_reqs, 0, sizeof(active_reqs));
- for ( i = 0; i < MAX_ACTIVE_REQS; i++ )
- active_req_ring[i] = i;
-}
-
-/* Requests passing through the tap to the backend hijack the id field
- * in the request message. In it we put the AR index _AND_ the fe domid.
- * the domid is used by the backend to map the pages properly.
- */
-
-static inline unsigned long MAKE_ID(domid_t fe_dom, ACTIVE_RING_IDX idx)
-{
- return ( (fe_dom << 16) | MASK_ACTIVE_IDX(idx) );
-}
-
-/*-----[ Ring helpers ]---------------------------------------------------*/
-
-static void maybe_trigger_blktap_schedule(void);
-
-inline int write_resp_to_fe_ring(blkif_t *blkif, blkif_response_t *rsp)
-{
- blkif_response_t *resp_d;
- active_req_t *ar;
-
- ar = &active_reqs[ID_TO_IDX(rsp->id)];
- rsp->id = ar->id;
-
- resp_d = RING_GET_RESPONSE(&blkif->blk_ring,
- blkif->blk_ring.rsp_prod_pvt);
- memcpy(resp_d, rsp, sizeof(blkif_response_t));
- wmb();
- blkif->blk_ring.rsp_prod_pvt++;
-
- blkif_put(ar->blkif);
- free_active_req(ar);
-
- return 0;
-}
-
-inline int write_req_to_be_ring(blkif_request_t *req)
-{
- blkif_request_t *req_d;
-
- if ( blktap_be_state != BLKIF_STATE_CONNECTED ) {
- WPRINTK("Tap trying to access an unconnected backend!\n");
- return 0;
- }
-
- req_d = RING_GET_REQUEST(&blktap_be_ring,
- blktap_be_ring.req_prod_pvt);
- memcpy(req_d, req, sizeof(blkif_request_t));
- wmb();
- blktap_be_ring.req_prod_pvt++;
-
- return 0;
-}
-
-void kick_fe_domain(blkif_t *blkif)
-{
- RING_PUSH_RESPONSES(&blkif->blk_ring);
- notify_via_evtchn(blkif->evtchn);
- DPRINTK("notified FE(dom %u)\n", blkif->domid);
-
- /* We just feed up a batch of request slots... */
- maybe_trigger_blktap_schedule();
-
-}
-
-void kick_be_domain(void)
-{
- if ( blktap_be_state != BLKIF_STATE_CONNECTED )
- return;
-
- wmb(); /* Ensure that the frontend can see the requests. */
- RING_PUSH_REQUESTS(&blktap_be_ring);
- notify_via_evtchn(blktap_be_evtchn);
- DPRINTK("notified BE\n");
-}
-
-/*-----[ Data to/from Frontend (client) VMs ]-----------------------------*/
-
-/*-----[ Scheduler list maint -from blkback ]--- */
-
-static struct list_head blkio_schedule_list;
-static spinlock_t blkio_schedule_list_lock;
-
-static int __on_blkdev_list(blkif_t *blkif)
-{
- return blkif->blkdev_list.next != NULL;
-}
-
-static void remove_from_blkdev_list(blkif_t *blkif)
-{
- unsigned long flags;
- if ( !__on_blkdev_list(blkif) ) return;
- spin_lock_irqsave(&blkio_schedule_list_lock, flags);
- if ( __on_blkdev_list(blkif) )
- {
- list_del(&blkif->blkdev_list);
- blkif->blkdev_list.next = NULL;
- blkif_put(blkif);
- }
- spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
-}
-
-static void add_to_blkdev_list_tail(blkif_t *blkif)
-{
- unsigned long flags;
- if ( __on_blkdev_list(blkif) ) return;
- spin_lock_irqsave(&blkio_schedule_list_lock, flags);
- if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) )
- {
- list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
- blkif_get(blkif);
- }
- spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
-}
-
-
-/*-----[ Scheduler functions - from blkback ]--- */
-
-static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);
-
-static int do_block_io_op(blkif_t *blkif, int max_to_do);
-
-static int blkio_schedule(void *arg)
-{
- DECLARE_WAITQUEUE(wq, current);
-
- blkif_t *blkif;
- struct list_head *ent;
-
- daemonize(
- "xentapd"
- );
-
- for ( ; ; )
- {
- /* Wait for work to do. */
- add_wait_queue(&blkio_schedule_wait, &wq);
- set_current_state(TASK_INTERRUPTIBLE);
- if ( (NR_ACTIVE_REQS == MAX_ACTIVE_REQS) ||
- list_empty(&blkio_schedule_list) )
- schedule();
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&blkio_schedule_wait, &wq);
-
- /* Queue up a batch of requests. */
- while ( (NR_ACTIVE_REQS < MAX_ACTIVE_REQS) &&
- !list_empty(&blkio_schedule_list) )
- {
- ent = blkio_schedule_list.next;
- blkif = list_entry(ent, blkif_t, blkdev_list);
- blkif_get(blkif);
- remove_from_blkdev_list(blkif);
- if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) )
- add_to_blkdev_list_tail(blkif);
- blkif_put(blkif);
- }
- }
-}
-
-static void maybe_trigger_blktap_schedule(void)
-{
- /*
- * Needed so that two processes, who together make the following predicate
- * true, don't both read stale values and evaluate the predicate
- * incorrectly. Incredibly unlikely to stall the scheduler on x86, but...
- */
- smp_mb();
-
- if ( (NR_ACTIVE_REQS < (MAX_ACTIVE_REQS/2)) &&
- !list_empty(&blkio_schedule_list) )
- wake_up(&blkio_schedule_wait);
-}
-
-void blkif_deschedule(blkif_t *blkif)
-{
- remove_from_blkdev_list(blkif);
-}
-
-void __init blkdev_schedule_init(void)
-{
- spin_lock_init(&blkio_schedule_list_lock);
- INIT_LIST_HEAD(&blkio_schedule_list);
-
- if ( kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES) < 0 )
- BUG();
-}
-
-/*-----[ Interrupt entry from a frontend ]------ */
-
-irqreturn_t blkif_ptfe_int(int irq, void *dev_id, struct pt_regs *regs)
-{
- blkif_t *blkif = dev_id;
-
- add_to_blkdev_list_tail(blkif);
- maybe_trigger_blktap_schedule();
- return IRQ_HANDLED;
-}
-
-/*-----[ Other Frontend Ring functions ]-------- */
-
-/* irqreturn_t blkif_ptfe_int(int irq, void *dev_id, struct pt_regs *regs)*/
-static int do_block_io_op(blkif_t *blkif, int max_to_do)
-{
- /* we have pending messages from the real frontend. */
-
- blkif_request_t *req_s;
- RING_IDX i, rp;
- unsigned long flags;
- active_req_t *ar;
- int more_to_do = 0;
- int notify_be = 0, notify_user = 0;
-
- /* lock both rings */
- spin_lock_irqsave(&blkif_io_lock, flags);
-
- rp = blkif->blk_ring.sring->req_prod;
- rmb();
-
- for ( i = blkif->blk_ring.req_cons;
- (i != rp) &&
- !RING_REQUEST_CONS_OVERFLOW(&blkif->blk_ring, i);
- i++ )
- {
-
- if ((--max_to_do == 0) || (NR_ACTIVE_REQS == MAX_ACTIVE_REQS))
- {
- more_to_do = 1;
- break;
- }
-
- req_s = RING_GET_REQUEST(&blkif->blk_ring, i);
- /* This is a new request:
- * Assign an active request record, and remap the id.
- */
- ar = get_active_req();
- ar->id = req_s->id;
- ar->nr_pages = req_s->nr_segments;
- blkif_get(blkif);
- ar->blkif = blkif;
- req_s->id = MAKE_ID(blkif->domid, ACTIVE_IDX(ar));
- /* WPRINTK("%3u < %3lu\n", ID_TO_IDX(req_s->id), ar->id); */
-
- /* FE -> BE interposition point is here. */
-
- /* ------------------------------------------------------------- */
- /* BLKIF_OP_PROBE_HACK: */
- /* Signal to the backend that we are a tap domain. */
-
- if (req_s->operation == BLKIF_OP_PROBE) {
- DPRINTK("Adding BLKTAP_COOKIE to PROBE request.\n");
- req_s->frame_and_sects[1] = BLKTAP_COOKIE;
- }
-
- /* ------------------------------------------------------------- */
-
- /* If we are in MODE_INTERCEPT_FE or MODE_COPY_FE: */
- if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) ||
- (blktap_mode & BLKTAP_MODE_COPY_FE) ) {
-
- /* Copy the response message to UFERing */
- /* In MODE_INTERCEPT_FE, map attached pages into the app vma */
- /* In MODE_COPY_FE_PAGES, copy attached pages into the app vma */
-
- DPRINTK("req->UFERing\n");
- blktap_write_fe_ring(req_s);
- notify_user = 1;
- }
-
- /* If we are not in MODE_INTERCEPT_FE or MODE_INTERCEPT_BE: */
- if ( !((blktap_mode & BLKTAP_MODE_INTERCEPT_FE) ||
- (blktap_mode & BLKTAP_MODE_INTERCEPT_BE)) ) {
-
- /* be included to prevent noise from the fe when its off */
- /* copy the request message to the BERing */
-
- DPRINTK("blktap: FERing[%u] -> BERing[%u]\n",
- (unsigned)i & (RING_SIZE(&blktap_be_ring)-1),
- (unsigned)blktap_be_ring.req_prod_pvt &
- (RING_SIZE((&blktap_be_ring)-1)));
-
- write_req_to_be_ring(req_s);
- notify_be = 1;
- }
- }
-
- blkif->blk_ring.req_cons = i;
-
- /* unlock rings */
- spin_unlock_irqrestore(&blkif_io_lock, flags);
-
- if (notify_user)
- blktap_kick_user();
- if (notify_be)
- kick_be_domain();
-
- return more_to_do;
-}
-
-/*-----[ Data to/from Backend (server) VM ]------------------------------*/
-
-
-irqreturn_t blkif_ptbe_int(int irq, void *dev_id,
- struct pt_regs *ptregs)
-{
- blkif_response_t *resp_s;
- blkif_t *blkif;
- RING_IDX rp, i;
- unsigned long flags;
-
- DPRINTK("PT got BE interrupt.\n");
-
- /* lock both rings */
- spin_lock_irqsave(&blkif_io_lock, flags);
-
- rp = blktap_be_ring.sring->rsp_prod;
- rmb();
-
- for ( i = blktap_be_ring.rsp_cons; i != rp; i++)
- {
- resp_s = RING_GET_RESPONSE(&blktap_be_ring, i);
-
- /* BE -> FE interposition point is here. */
-
- blkif = active_reqs[ID_TO_IDX(resp_s->id)].blkif;
-
- /* If we are in MODE_INTERCEPT_BE or MODE_COPY_BE: */
- if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_BE) ||
- (blktap_mode & BLKTAP_MODE_COPY_BE) ) {
-
- /* Copy the response message to UBERing */
- /* In MODE_INTERCEPT_BE, map attached pages into the app vma */
- /* In MODE_COPY_BE_PAGES, copy attached pages into the app vma */
-
- DPRINTK("rsp->UBERing\n");
- blktap_write_be_ring(resp_s);
- blktap_kick_user();
-
- }
-
- /* If we are NOT in MODE_INTERCEPT_BE or MODE_INTERCEPT_FE: */
- if ( !((blktap_mode & BLKTAP_MODE_INTERCEPT_BE) ||
- (blktap_mode & BLKTAP_MODE_INTERCEPT_FE)) ) {
-
- /* (fe included to prevent random interference from the BE) */
- /* Copy the response message to FERing */
-
- DPRINTK("blktap: BERing[%u] -> FERing[%u]\n",
- (unsigned)i & (RING_SIZE(&blkif->blk_ring)-1),
- (unsigned)blkif->blk_ring.rsp_prod_pvt &
- (RING_SIZE((&blkif->blk_ring)-1)));
-
- write_resp_to_fe_ring(blkif, resp_s);
- kick_fe_domain(blkif);
-
- }
- }
-
- blktap_be_ring.rsp_cons = i;
-
-
- spin_unlock_irqrestore(&blkif_io_lock, flags);
-
- return IRQ_HANDLED;
-}
-
-/* Debug : print the current ring indices. */
-
-void print_be_ring_idxs(void)
-{
- if (blktap_be_ring.sring != NULL) {
- WPRINTK("BE Ring: \n--------\n");
- WPRINTK("BE: rsp_cons: %2d, req_prod_prv: %2d "
- "| req_prod: %2d, rsp_prod: %2d\n",
- blktap_be_ring.rsp_cons,
- blktap_be_ring.req_prod_pvt,
- blktap_be_ring.sring->req_prod,
- blktap_be_ring.sring->rsp_prod);
- }
-}
+++ /dev/null
-/******************************************************************************
- * blktap_userdev.c
- *
- * XenLinux virtual block-device tap.
- * Control interface between the driver and a character device.
- *
- * Copyright (c) 2004, Andrew Warfield
- */
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/miscdevice.h>
-#include <linux/errno.h>
-#include <linux/major.h>
-#include <linux/gfp.h>
-#include <linux/poll.h>
-#include <asm/pgalloc.h>
-#include <asm/tlbflush.h>
-#include <asm-xen/xen-public/io/blkif.h> /* for control ring. */
-#ifdef CONFIG_XEN_BLKDEV_GRANT
-#include <asm-xen/xen-public/grant_table.h>
-#endif
-
-#include "blktap.h"
-
-
-unsigned long blktap_mode = BLKTAP_MODE_PASSTHROUGH;
-
-/* Only one process may open /dev/xen/blktap at any time. */
-static unsigned long blktap_dev_inuse;
-unsigned long blktap_ring_ok; /* make this ring->state */
-
-/* for poll: */
-static wait_queue_head_t blktap_wait;
-
-/* Rings up to user space. */
-static blkif_front_ring_t blktap_ufe_ring;
-static blkif_back_ring_t blktap_ube_ring;
-static ctrl_front_ring_t blktap_uctrl_ring;
-
-/* local prototypes */
-static int blktap_read_fe_ring(void);
-static int blktap_read_be_ring(void);
-
-
-/* -------[ mmap region ]--------------------------------------------- */
-/*
- * We use a big chunk of address space to map in-flight requests into,
- * and export this region up to user-space. See the comments in blkback
- * about this -- the two must be kept in sync if the tap is used as a
- * passthrough.
- */
-
-#define MAX_PENDING_REQS 64
-
-/* immediately before the mmap area, we have a bunch of pages reserved
- * for shared memory rings.
- */
-#define RING_PAGES 3 /* Ctrl, Front, and Back */
-
-/* Where things are inside the device mapping. */
-struct vm_area_struct *blktap_vma = NULL;
-unsigned long mmap_vstart; /* Kernel pages for mapping in data. */
-unsigned long rings_vstart; /* start of mmaped vma */
-unsigned long user_vstart; /* start of user mappings */
-
-#define MMAP_PAGES_PER_REQUEST \
- (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
-#define MMAP_PAGES \
- (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
-#define MMAP_VADDR(_start, _req,_seg) \
- ( _start + \
- ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
- ((_seg) * PAGE_SIZE))
-
-/* -------[ grant handles ]------------------------------------------- */
-
-#ifdef CONFIG_XEN_BLKDEV_GRANT
-/* When using grant tables to map a frame for device access then the
- * handle returned must be used to unmap the frame. This is needed to
- * drop the ref count on the frame.
- */
-struct grant_handle_pair
-{
- u16 kernel;
- u16 user;
-};
-static struct grant_handle_pair pending_grant_handles[MMAP_PAGES];
-#define pending_handle(_idx, _i) \
- (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
-#define BLKTAP_INVALID_HANDLE(_g) \
- (((_g->kernel) == 0xFFFF) && ((_g->user) == 0xFFFF))
-#define BLKTAP_INVALIDATE_HANDLE(_g) do { \
- (_g)->kernel = 0xFFFF; (_g)->user = 0xFFFF; \
- } while(0)
-
-#endif
-
-
-/* -------[ blktap vm ops ]------------------------------------------- */
-
-static struct page *blktap_nopage(struct vm_area_struct *vma,
- unsigned long address,
- int *type)
-{
- /*
- * if the page has not been mapped in by the driver then generate
- * a SIGBUS to the domain.
- */
-
- force_sig(SIGBUS, current);
-
- return 0;
-}
-
-struct vm_operations_struct blktap_vm_ops = {
- nopage: blktap_nopage,
-};
-
-/* -------[ blktap file ops ]----------------------------------------- */
-
-static int blktap_open(struct inode *inode, struct file *filp)
-{
- blkif_sring_t *sring;
- ctrl_sring_t *csring;
-
- if ( test_and_set_bit(0, &blktap_dev_inuse) )
- return -EBUSY;
-
- /* Allocate the ctrl ring. */
- csring = (ctrl_sring_t *)get_zeroed_page(GFP_KERNEL);
- if (csring == NULL)
- goto fail_nomem;
-
- SetPageReserved(virt_to_page(csring));
-
- SHARED_RING_INIT(csring);
- FRONT_RING_INIT(&blktap_uctrl_ring, csring, PAGE_SIZE);
-
- /* Allocate the fe ring. */
- sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
- if (sring == NULL)
- goto fail_free_ctrl;
-
- SetPageReserved(virt_to_page(sring));
-
- SHARED_RING_INIT(sring);
- FRONT_RING_INIT(&blktap_ufe_ring, sring, PAGE_SIZE);
-
- /* Allocate the be ring. */
- sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
- if (sring == NULL)
- goto fail_free_fe;
-
- SetPageReserved(virt_to_page(sring));
-
- SHARED_RING_INIT(sring);
- BACK_RING_INIT(&blktap_ube_ring, sring, PAGE_SIZE);
-
- DPRINTK(KERN_ALERT "blktap open.\n");
-
- return 0;
-
- fail_free_ctrl:
- free_page( (unsigned long) blktap_uctrl_ring.sring);
-
- fail_free_fe:
- free_page( (unsigned long) blktap_ufe_ring.sring);
-
- fail_nomem:
- return -ENOMEM;
-}
-
-static int blktap_release(struct inode *inode, struct file *filp)
-{
- blktap_dev_inuse = 0;
- blktap_ring_ok = 0;
-
- DPRINTK(KERN_ALERT "blktap closed.\n");
-
- /* Free the ring page. */
- ClearPageReserved(virt_to_page(blktap_uctrl_ring.sring));
- free_page((unsigned long) blktap_uctrl_ring.sring);
-
- ClearPageReserved(virt_to_page(blktap_ufe_ring.sring));
- free_page((unsigned long) blktap_ufe_ring.sring);
-
- ClearPageReserved(virt_to_page(blktap_ube_ring.sring));
- free_page((unsigned long) blktap_ube_ring.sring);
-
- /* Clear any active mappings and free foreign map table */
- if (blktap_vma != NULL) {
- zap_page_range(blktap_vma, blktap_vma->vm_start,
- blktap_vma->vm_end - blktap_vma->vm_start, NULL);
- blktap_vma = NULL;
- }
-
- return 0;
-}
-
-/* Note on mmap:
- * We need to map pages to user space in a way that will allow the block
- * subsystem set up direct IO to them. This couldn't be done before, because
- * there isn't really a sane way to make a user virtual address down to a
- * physical address when the page belongs to another domain.
- *
- * My first approach was to map the page in to kernel memory, add an entry
- * for it in the physical frame list (using alloc_lomem_region as in blkback)
- * and then attempt to map that page up to user space. This is disallowed
- * by xen though, which realizes that we don't really own the machine frame
- * underlying the physical page.
- *
- * The new approach is to provide explicit support for this in xen linux.
- * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
- * mapped from other vms. vma->vm_private_data is set up as a mapping
- * from pages to actual page structs. There is a new clause in get_user_pages
- * that does the right thing for this sort of mapping.
- *
- * blktap_mmap sets up this mapping. Most of the real work is done in
- * blktap_write_fe_ring below.
- */
-static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
-{
- int size;
- struct page **map;
- int i;
-
- DPRINTK(KERN_ALERT "blktap mmap (%lx, %lx)\n",
- vma->vm_start, vma->vm_end);
-
- vma->vm_flags |= VM_RESERVED;
- vma->vm_ops = &blktap_vm_ops;
-
- size = vma->vm_end - vma->vm_start;
- if ( size != ( (MMAP_PAGES + RING_PAGES) << PAGE_SHIFT ) ) {
- printk(KERN_INFO
- "blktap: you _must_ map exactly %d pages!\n",
- MMAP_PAGES + RING_PAGES);
- return -EAGAIN;
- }
-
- size >>= PAGE_SHIFT;
- DPRINTK(KERN_INFO "blktap: 2 rings + %d pages.\n", size-1);
-
- rings_vstart = vma->vm_start;
- user_vstart = rings_vstart + (RING_PAGES << PAGE_SHIFT);
-
- /* Map the ring pages to the start of the region and reserve it. */
-
- /* not sure if I really need to do this... */
- vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-
- DPRINTK("Mapping ctrl_ring page %lx.\n", __pa(blktap_uctrl_ring.sring));
- if (remap_pfn_range(vma, vma->vm_start,
- __pa(blktap_uctrl_ring.sring) >> PAGE_SHIFT,
- PAGE_SIZE, vma->vm_page_prot))
- goto fail;
-
-
- DPRINTK("Mapping be_ring page %lx.\n", __pa(blktap_ube_ring.sring));
- if (remap_pfn_range(vma, vma->vm_start + PAGE_SIZE,
- __pa(blktap_ube_ring.sring) >> PAGE_SHIFT,
- PAGE_SIZE, vma->vm_page_prot))
- goto fail;
-
- DPRINTK("Mapping fe_ring page %lx.\n", __pa(blktap_ufe_ring.sring));
- if (remap_pfn_range(vma, vma->vm_start + ( 2 * PAGE_SIZE ),
- __pa(blktap_ufe_ring.sring) >> PAGE_SHIFT,
- PAGE_SIZE, vma->vm_page_prot))
- goto fail;
-
- /* Mark this VM as containing foreign pages, and set up mappings. */
- map = kmalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
- * sizeof(struct page_struct*),
- GFP_KERNEL);
- if (map == NULL) goto fail;
-
- for (i=0; i<((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++)
- map[i] = NULL;
-
- vma->vm_private_data = map;
- vma->vm_flags |= VM_FOREIGN;
-
- blktap_vma = vma;
- blktap_ring_ok = 1;
-
- return 0;
- fail:
- /* Clear any active mappings. */
- zap_page_range(vma, vma->vm_start,
- vma->vm_end - vma->vm_start, NULL);
-
- return -ENOMEM;
-}
-
-static int blktap_ioctl(struct inode *inode, struct file *filp,
- unsigned int cmd, unsigned long arg)
-{
- switch(cmd) {
- case BLKTAP_IOCTL_KICK_FE: /* There are fe messages to process. */
- return blktap_read_fe_ring();
-
- case BLKTAP_IOCTL_KICK_BE: /* There are be messages to process. */
- return blktap_read_be_ring();
-
- case BLKTAP_IOCTL_SETMODE:
- if (BLKTAP_MODE_VALID(arg)) {
- blktap_mode = arg;
- /* XXX: may need to flush rings here. */
- printk(KERN_INFO "blktap: set mode to %lx\n", arg);
- return 0;
- }
- case BLKTAP_IOCTL_PRINT_IDXS:
- {
- print_be_ring_idxs();
- print_fe_ring_idxs();
- WPRINTK("User Rings: \n-----------\n");
- WPRINTK("UF: rsp_cons: %2d, req_prod_prv: %2d "
- "| req_prod: %2d, rsp_prod: %2d\n",
- blktap_ufe_ring.rsp_cons,
- blktap_ufe_ring.req_prod_pvt,
- blktap_ufe_ring.sring->req_prod,
- blktap_ufe_ring.sring->rsp_prod);
- WPRINTK("UB: req_cons: %2d, rsp_prod_prv: %2d "
- "| req_prod: %2d, rsp_prod: %2d\n",
- blktap_ube_ring.req_cons,
- blktap_ube_ring.rsp_prod_pvt,
- blktap_ube_ring.sring->req_prod,
- blktap_ube_ring.sring->rsp_prod);
-
- }
- }
- return -ENOIOCTLCMD;
-}
-
-static unsigned int blktap_poll(struct file *file, poll_table *wait)
-{
- poll_wait(file, &blktap_wait, wait);
-
- if ( RING_HAS_UNPUSHED_REQUESTS(&blktap_uctrl_ring) ||
- RING_HAS_UNPUSHED_REQUESTS(&blktap_ufe_ring) ||
- RING_HAS_UNPUSHED_RESPONSES(&blktap_ube_ring) ) {
-
- flush_tlb_all();
-
- RING_PUSH_REQUESTS(&blktap_uctrl_ring);
- RING_PUSH_REQUESTS(&blktap_ufe_ring);
- RING_PUSH_RESPONSES(&blktap_ube_ring);
- return POLLIN | POLLRDNORM;
- }
-
- return 0;
-}
-
-void blktap_kick_user(void)
-{
- /* blktap_ring->req_prod = blktap_req_prod; */
- wake_up_interruptible(&blktap_wait);
-}
-
-static struct file_operations blktap_fops = {
- owner: THIS_MODULE,
- poll: blktap_poll,
- ioctl: blktap_ioctl,
- open: blktap_open,
- release: blktap_release,
- mmap: blktap_mmap,
-};
-
-/*-----[ Data to/from user space ]----------------------------------------*/
-
-static void fast_flush_area(int idx, int nr_pages)
-{
-#ifdef CONFIG_XEN_BLKDEV_GRANT
- struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
- unsigned int i, op = 0;
- struct grant_handle_pair *handle;
- unsigned long ptep;
-
- for (i=0; i<nr_pages; i++)
- {
- handle = &pending_handle(idx, i);
- if (!BLKTAP_INVALID_HANDLE(handle))
- {
-
- unmap[op].host_addr = MMAP_VADDR(mmap_vstart, idx, i);
- unmap[op].dev_bus_addr = 0;
- unmap[op].handle = handle->kernel;
- op++;
-
- if (create_lookup_pte_addr(blktap_vma->vm_mm,
- MMAP_VADDR(user_vstart, idx, i),
- &ptep) !=0) {
- DPRINTK("Couldn't get a pte addr!\n");
- return;
- }
- unmap[op].host_addr = ptep;
- unmap[op].dev_bus_addr = 0;
- unmap[op].handle = handle->user;
- op++;
-
- BLKTAP_INVALIDATE_HANDLE(handle);
- }
- }
- if ( unlikely(HYPERVISOR_grant_table_op(
- GNTTABOP_unmap_grant_ref, unmap, op)))
- BUG();
-#else
- multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST];
- int i;
-
- for ( i = 0; i < nr_pages; i++ )
- {
- MULTI_update_va_mapping(mcl+i, MMAP_VADDR(mmap_vstart, idx, i),
- __pte(0), 0);
- }
-
- mcl[nr_pages-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
- if ( unlikely(HYPERVISOR_multicall(mcl, nr_pages) != 0) )
- BUG();
-#endif
-}
-
-
-int blktap_write_fe_ring(blkif_request_t *req)
-{
- blkif_request_t *target;
- int i, ret = 0;
-#ifdef CONFIG_XEN_BLKDEV_GRANT
- struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
- int op;
-#else
- unsigned long remap_prot;
- multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST+1];
- mmu_update_t mmu[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-#endif
-
- /*
- * This is called to pass a request from the real frontend domain's
- * blkif ring to the character device.
- */
-
- if ( ! blktap_ring_ok ) {
- DPRINTK("blktap: ufe_ring not ready for a request!\n");
- return 0;
- }
-
- if ( RING_FULL(&blktap_ufe_ring) ) {
- PRINTK("blktap: fe_ring is full, can't add.\n");
- return 0;
- }
-
- flush_cache_all(); /* a noop on intel... */
-
- target = RING_GET_REQUEST(&blktap_ufe_ring, blktap_ufe_ring.req_prod_pvt);
- memcpy(target, req, sizeof(*req));
-
- /* Map the foreign pages directly in to the application */
-#ifdef CONFIG_XEN_BLKDEV_GRANT
- op = 0;
- for (i=0; i<target->nr_segments; i++) {
-
- unsigned long uvaddr;
- unsigned long kvaddr;
- unsigned long ptep;
-
- uvaddr = MMAP_VADDR(user_vstart, ID_TO_IDX(req->id), i);
- kvaddr = MMAP_VADDR(mmap_vstart, ID_TO_IDX(req->id), i);
-
- /* Map the remote page to kernel. */
- map[op].host_addr = kvaddr;
- map[op].dom = ID_TO_DOM(req->id);
- map[op].ref = blkif_gref_from_fas(target->frame_and_sects[i]);
- map[op].flags = GNTMAP_host_map;
- /* This needs a bit more thought in terms of interposition:
- * If we want to be able to modify pages during write using
- * grant table mappings, the guest will either need to allow
- * it, or we'll need to incur a copy. */
- if (req->operation == BLKIF_OP_WRITE)
- map[op].flags |= GNTMAP_readonly;
- op++;
-
- /* Now map it to user. */
- ret = create_lookup_pte_addr(blktap_vma->vm_mm, uvaddr, &ptep);
- if (ret)
- {
- DPRINTK("Couldn't get a pte addr!\n");
- goto fail;
- }
-
- map[op].host_addr = ptep;
- map[op].dom = ID_TO_DOM(req->id);
- map[op].ref = blkif_gref_from_fas(target->frame_and_sects[i]);
- map[op].flags = GNTMAP_host_map | GNTMAP_application_map
- | GNTMAP_contains_pte;
- /* Above interposition comment applies here as well. */
- if (req->operation == BLKIF_OP_WRITE)
- map[op].flags |= GNTMAP_readonly;
- op++;
- }
-
- if ( unlikely(HYPERVISOR_grant_table_op(
- GNTTABOP_map_grant_ref, map, op)))
- BUG();
-
- op = 0;
- for (i=0; i<(target->nr_segments*2); i+=2) {
- unsigned long uvaddr;
- unsigned long kvaddr;
- unsigned long offset;
- int cancel = 0;
-
- uvaddr = MMAP_VADDR(user_vstart, ID_TO_IDX(req->id), i/2);
- kvaddr = MMAP_VADDR(mmap_vstart, ID_TO_IDX(req->id), i/2);
-
- if ( unlikely(map[i].handle < 0) ) {
- DPRINTK("Error on kernel grant mapping (%d)\n", map[i].handle);
- ret = map[i].handle;
- cancel = 1;
- }
-
- if ( unlikely(map[i+1].handle < 0) ) {
- DPRINTK("Error on user grant mapping (%d)\n", map[i+1].handle);
- ret = map[i+1].handle;
- cancel = 1;
- }
-
- if (cancel)
- goto fail;
-
- /* Set the necessary mappings in p2m and in the VM_FOREIGN
- * vm_area_struct to allow user vaddr -> struct page lookups
- * to work. This is needed for direct IO to foreign pages. */
- phys_to_machine_mapping[__pa(kvaddr) >> PAGE_SHIFT] =
- FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT);
-
- offset = (uvaddr - blktap_vma->vm_start) >> PAGE_SHIFT;
- ((struct page **)blktap_vma->vm_private_data)[offset] =
- pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
-
- /* Save handles for unmapping later. */
- pending_handle(ID_TO_IDX(req->id), i/2).kernel = map[i].handle;
- pending_handle(ID_TO_IDX(req->id), i/2).user = map[i+1].handle;
- }
-
-#else
-
- remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW;
-
- for (i=0; i<target->nr_segments; i++) {
- unsigned long buf;
- unsigned long uvaddr;
- unsigned long kvaddr;
- unsigned long offset;
- unsigned long ptep;
-
- buf = target->frame_and_sects[i] & PAGE_MASK;
- uvaddr = MMAP_VADDR(user_vstart, ID_TO_IDX(req->id), i);
- kvaddr = MMAP_VADDR(mmap_vstart, ID_TO_IDX(req->id), i);
-
- MULTI_update_va_mapping_otherdomain(
- mcl+i,
- kvaddr,
- pfn_pte_ma(buf >> PAGE_SHIFT, __pgprot(remap_prot)),
- 0,
- ID_TO_DOM(req->id));
-
- phys_to_machine_mapping[__pa(kvaddr)>>PAGE_SHIFT] =
- FOREIGN_FRAME(buf >> PAGE_SHIFT);
-
- ret = create_lookup_pte_addr(blktap_vma->vm_mm, uvaddr, &ptep);
- if (ret)
- {
- DPRINTK("error getting pte\n");
- goto fail;
- }
-
- mmu[i].ptr = ptep;
- mmu[i].val = (target->frame_and_sects[i] & PAGE_MASK)
- | pgprot_val(blktap_vma->vm_page_prot);
-
- offset = (uvaddr - blktap_vma->vm_start) >> PAGE_SHIFT;
- ((struct page **)blktap_vma->vm_private_data)[offset] =
- pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
- }
-
- /* Add the mmu_update call. */
- mcl[i].op = __HYPERVISOR_mmu_update;
- mcl[i].args[0] = (unsigned long)mmu;
- mcl[i].args[1] = target->nr_segments;
- mcl[i].args[2] = 0;
- mcl[i].args[3] = ID_TO_DOM(req->id);
-
- BUG_ON(HYPERVISOR_multicall(mcl, target->nr_segments+1) != 0);
-
- /* Make sure it all worked. */
- for ( i = 0; i < target->nr_segments; i++ )
- {
- if ( unlikely(mcl[i].result != 0) )
- {
- DPRINTK("invalid buffer -- could not remap it\n");
- ret = mcl[i].result;
- goto fail;
- }
- }
- if ( unlikely(mcl[i].result != 0) )
- {
- DPRINTK("direct remapping of pages to /dev/blktap failed.\n");
- ret = mcl[i].result;
- goto fail;
- }
-#endif /* CONFIG_XEN_BLKDEV_GRANT */
-
- /* Mark mapped pages as reserved: */
- for ( i = 0; i < target->nr_segments; i++ )
- {
- unsigned long kvaddr;
-
- kvaddr = MMAP_VADDR(mmap_vstart, ID_TO_IDX(req->id), i);
- SetPageReserved(pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT));
- }
-
-
- blktap_ufe_ring.req_prod_pvt++;
-
- return 0;
-
- fail:
- fast_flush_area(ID_TO_IDX(req->id), target->nr_segments);
- return ret;
-}
-
-int blktap_write_be_ring(blkif_response_t *rsp)
-{
- blkif_response_t *target;
-
- /*
- * This is called to pass a request from the real backend domain's
- * blkif ring to the character device.
- */
-
- if ( ! blktap_ring_ok ) {
- DPRINTK("blktap: be_ring not ready for a request!\n");
- return 0;
- }
-
- /* No test for fullness in the response direction. */
-
- target = RING_GET_RESPONSE(&blktap_ube_ring,
- blktap_ube_ring.rsp_prod_pvt);
- memcpy(target, rsp, sizeof(*rsp));
-
- /* no mapping -- pages were mapped in blktap_write_fe_ring() */
-
- blktap_ube_ring.rsp_prod_pvt++;
-
- return 0;
-}
-
-static int blktap_read_fe_ring(void)
-{
- /* This is called to read responses from the UFE ring. */
-
- RING_IDX i, j, rp;
- blkif_response_t *resp_s;
- blkif_t *blkif;
- active_req_t *ar;
-
- DPRINTK("blktap_read_fe_ring()\n");
-
- /* if we are forwarding from UFERring to FERing */
- if (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) {
-
- /* for each outstanding message on the UFEring */
- rp = blktap_ufe_ring.sring->rsp_prod;
- rmb();
-
- for ( i = blktap_ufe_ring.rsp_cons; i != rp; i++ )
- {
- resp_s = RING_GET_RESPONSE(&blktap_ufe_ring, i);
-
- DPRINTK("resp->fe_ring\n");
- ar = lookup_active_req(ID_TO_IDX(resp_s->id));
- blkif = ar->blkif;
- for (j = 0; j < ar->nr_pages; j++) {
- unsigned long vaddr;
- struct page **map = blktap_vma->vm_private_data;
- int offset;
-
- vaddr = MMAP_VADDR(user_vstart, ID_TO_IDX(resp_s->id), j);
- offset = (vaddr - blktap_vma->vm_start) >> PAGE_SHIFT;
-
- ClearPageReserved(virt_to_page(vaddr));
- map[offset] = NULL;
- }
-
- fast_flush_area(ID_TO_IDX(resp_s->id), ar->nr_pages);
- zap_page_range(blktap_vma,
- MMAP_VADDR(user_vstart, ID_TO_IDX(resp_s->id), 0),
- ar->nr_pages << PAGE_SHIFT, NULL);
- write_resp_to_fe_ring(blkif, resp_s);
- blktap_ufe_ring.rsp_cons = i + 1;
- kick_fe_domain(blkif);
- }
- }
- return 0;
-}
-
-static int blktap_read_be_ring(void)
-{
- /* This is called to read requests from the UBE ring. */
-
- RING_IDX i, rp;
- blkif_request_t *req_s;
-
- DPRINTK("blktap_read_be_ring()\n");
-
- /* if we are forwarding from UFERring to FERing */
- if (blktap_mode & BLKTAP_MODE_INTERCEPT_BE) {
-
- /* for each outstanding message on the UFEring */
- rp = blktap_ube_ring.sring->req_prod;
- rmb();
- for ( i = blktap_ube_ring.req_cons; i != rp; i++ )
- {
- req_s = RING_GET_REQUEST(&blktap_ube_ring, i);
-
- DPRINTK("req->be_ring\n");
- write_req_to_be_ring(req_s);
- kick_be_domain();
- }
-
- blktap_ube_ring.req_cons = i;
- }
-
- return 0;
-}
-
-int blktap_write_ctrl_ring(ctrl_msg_t *msg)
-{
- ctrl_msg_t *target;
-
- if ( ! blktap_ring_ok ) {
- DPRINTK("blktap: be_ring not ready for a request!\n");
- return 0;
- }
-
- /* No test for fullness in the response direction. */
-
- target = RING_GET_REQUEST(&blktap_uctrl_ring,
- blktap_uctrl_ring.req_prod_pvt);
- memcpy(target, msg, sizeof(*msg));
-
- blktap_uctrl_ring.req_prod_pvt++;
-
- /* currently treat the ring as unidirectional. */
- blktap_uctrl_ring.rsp_cons = blktap_uctrl_ring.sring->rsp_prod;
-
- return 0;
-
-}
-
-/* -------[ blktap module setup ]------------------------------------- */
-
-static struct miscdevice blktap_miscdev = {
- .minor = BLKTAP_MINOR,
- .name = "blktap",
- .fops = &blktap_fops,
- .devfs_name = "misc/blktap",
-};
-
-int blktap_init(void)
-{
- int err, i, j;
- struct page *page;
-
- page = balloon_alloc_empty_page_range(MMAP_PAGES);
- BUG_ON(page == NULL);
- mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
-
-#ifdef CONFIG_XEN_BLKDEV_GRANT
- for (i=0; i<MAX_PENDING_REQS ; i++)
- for (j=0; j<BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
- BLKTAP_INVALIDATE_HANDLE(&pending_handle(i, j));
-#endif
-
- err = misc_register(&blktap_miscdev);
- if ( err != 0 )
- {
- printk(KERN_ALERT "Couldn't register /dev/misc/blktap (%d)\n", err);
- return err;
- }
-
- init_waitqueue_head(&blktap_wait);
-
-
- return 0;
-}
--- /dev/null
+
+#ifndef __BLKIF__BACKEND__COMMON_H__
+#define __BLKIF__BACKEND__COMMON_H__
+
+#include <linux/config.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+#include <asm/setup.h>
+#include <asm/pgalloc.h>
+#include <asm-xen/evtchn.h>
+#include <asm-xen/hypervisor.h>
+#include <asm-xen/xen-public/io/blkif.h>
+#include <asm-xen/xen-public/io/ring.h>
+#include <asm-xen/gnttab.h>
+
+#if 0
+#define ASSERT(_p) \
+ if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \
+ __LINE__, __FILE__); *(int*)0=0; }
+#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \
+ __FILE__ , __LINE__ , ## _a )
+#else
+#define ASSERT(_p) ((void)0)
+#define DPRINTK(_f, _a...) ((void)0)
+#endif
+
+#define WPRINTK(fmt, args...) printk(KERN_WARNING "blk_tap: " fmt, ##args)
+
+struct vbd {
+ blkif_vdev_t handle; /* what the domain refers to this vbd as */
+ unsigned char readonly; /* Non-zero -> read-only */
+ unsigned char type; /* VDISK_xxx */
+ blkif_pdev_t pdevice; /* phys device that this vbd maps to */
+ struct block_device *bdev;
+};
+
+typedef struct blkif_st {
+ /* Unique identifier for this interface. */
+ domid_t domid;
+ unsigned int handle;
+ /* Physical parameters of the comms window. */
+ unsigned long shmem_frame;
+ unsigned int evtchn;
+ unsigned int remote_evtchn;
+ /* Comms information. */
+ blkif_back_ring_t blk_ring;
+ /* VBDs attached to this interface. */
+ struct vbd vbd;
+ /* Private fields. */
+ enum { DISCONNECTED, CONNECTED } status;
+#ifdef CONFIG_XEN_BLKDEV_TAP_BE
+ /* Is this a blktap frontend */
+ unsigned int is_blktap;
+#endif
+ struct list_head blkdev_list;
+ spinlock_t blk_ring_lock;
+ atomic_t refcnt;
+
+ struct work_struct free_work;
+ u16 shmem_handle;
+ unsigned long shmem_vaddr;
+ grant_ref_t shmem_ref;
+} blkif_t;
+
+void blkif_create(blkif_be_create_t *create);
+void blkif_destroy(blkif_be_destroy_t *destroy);
+void blkif_connect(blkif_be_connect_t *connect);
+int blkif_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id);
+void blkif_disconnect_complete(blkif_t *blkif);
+blkif_t *alloc_blkif(domid_t domid);
+void free_blkif_callback(blkif_t *blkif);
+int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn);
+
+#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define blkif_put(_b) \
+ do { \
+ if ( atomic_dec_and_test(&(_b)->refcnt) ) \
+ free_blkif_callback(_b); \
+ } while (0)
+
+/* Create a vbd. */
+int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, blkif_pdev_t pdevice,
+ int readonly);
+void vbd_free(struct vbd *vbd);
+
+unsigned long vbd_size(struct vbd *vbd);
+unsigned int vbd_info(struct vbd *vbd);
+unsigned long vbd_secsize(struct vbd *vbd);
+
+struct phys_req {
+ unsigned short dev;
+ unsigned short nr_sects;
+ struct block_device *bdev;
+ blkif_sector_t sector_number;
+};
+
+int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation);
+
+void blkif_interface_init(void);
+
+void blkif_deschedule(blkif_t *blkif);
+
+void blkif_xenbus_init(void);
+
+irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
+
+#endif /* __BLKIF__BACKEND__COMMON_H__ */
--- /dev/null
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/interface.c
+ *
+ * Block-device interface management.
+ *
+ * Copyright (c) 2004, Keir Fraser
+ */
+
+#include "common.h"
+#include <asm-xen/evtchn.h>
+
+static kmem_cache_t *blkif_cachep;
+
+blkif_t *alloc_blkif(domid_t domid)
+{
+ blkif_t *blkif;
+
+ blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
+ if (!blkif)
+ return ERR_PTR(-ENOMEM);
+
+ memset(blkif, 0, sizeof(*blkif));
+ blkif->domid = domid;
+ blkif->status = DISCONNECTED;
+ spin_lock_init(&blkif->blk_ring_lock);
+ atomic_set(&blkif->refcnt, 1);
+
+ return blkif;
+}
+
+static int map_frontend_page(blkif_t *blkif, unsigned long localaddr,
+ unsigned long shared_page)
+{
+ struct gnttab_map_grant_ref op;
+ op.host_addr = localaddr;
+ op.flags = GNTMAP_host_map;
+ op.ref = shared_page;
+ op.dom = blkif->domid;
+
+ BUG_ON( HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1) );
+
+ if (op.handle < 0) {
+        DPRINTK("Grant table operation failure!\n");
+ return op.handle;
+ }
+
+ blkif->shmem_ref = shared_page;
+ blkif->shmem_handle = op.handle;
+ blkif->shmem_vaddr = localaddr;
+ return 0;
+}
+
+static void unmap_frontend_page(blkif_t *blkif)
+{
+ struct gnttab_unmap_grant_ref op;
+
+ op.host_addr = blkif->shmem_vaddr;
+ op.handle = blkif->shmem_handle;
+ op.dev_bus_addr = 0;
+ BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1));
+}
+
+int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn)
+{
+ struct vm_struct *vma;
+ blkif_sring_t *sring;
+ evtchn_op_t op = { .cmd = EVTCHNOP_bind_interdomain };
+ int err;
+
+ BUG_ON(blkif->remote_evtchn);
+
+ if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL )
+ return -ENOMEM;
+
+ err = map_frontend_page(blkif, (unsigned long)vma->addr, shared_page);
+ if (err) {
+ vfree(vma->addr);
+ return err;
+ }
+
+ op.u.bind_interdomain.dom1 = DOMID_SELF;
+ op.u.bind_interdomain.dom2 = blkif->domid;
+ op.u.bind_interdomain.port1 = 0;
+ op.u.bind_interdomain.port2 = evtchn;
+ err = HYPERVISOR_event_channel_op(&op);
+ if (err) {
+ unmap_frontend_page(blkif);
+ vfree(vma->addr);
+ return err;
+ }
+
+ blkif->evtchn = op.u.bind_interdomain.port1;
+ blkif->remote_evtchn = evtchn;
+
+ sring = (blkif_sring_t *)vma->addr;
+ SHARED_RING_INIT(sring);
+ BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE);
+
+ bind_evtchn_to_irqhandler(blkif->evtchn, blkif_be_int, 0, "blkif-backend",
+ blkif);
+ blkif->status = CONNECTED;
+ blkif->shmem_frame = shared_page;
+
+ return 0;
+}
+
+static void free_blkif(void *arg)
+{
+ evtchn_op_t op = { .cmd = EVTCHNOP_close };
+ blkif_t *blkif = (blkif_t *)arg;
+
+ op.u.close.port = blkif->evtchn;
+ op.u.close.dom = DOMID_SELF;
+ HYPERVISOR_event_channel_op(&op);
+ op.u.close.port = blkif->remote_evtchn;
+ op.u.close.dom = blkif->domid;
+ HYPERVISOR_event_channel_op(&op);
+
+ if (blkif->evtchn)
+ unbind_evtchn_from_irqhandler(blkif->evtchn, blkif);
+
+ if (blkif->blk_ring.sring) {
+ unmap_frontend_page(blkif);
+ vfree(blkif->blk_ring.sring);
+ blkif->blk_ring.sring = NULL;
+ }
+
+ kmem_cache_free(blkif_cachep, blkif);
+}
+
+void free_blkif_callback(blkif_t *blkif)
+{
+ INIT_WORK(&blkif->free_work, free_blkif, (void *)blkif);
+ schedule_work(&blkif->free_work);
+}
+
+void __init blkif_interface_init(void)
+{
+ blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t),
+ 0, 0, NULL, NULL);
+}
--- /dev/null
+/* Xenbus code for blkif tap
+
+ A Warfield.
+
+   Hastily modified from the original backend code:
+
+ Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <stdarg.h>
+#include <linux/module.h>
+#include <asm-xen/xenbus.h>
+#include "common.h"
+
+struct backend_info
+{
+ struct xenbus_device *dev;
+
+ /* our communications channel */
+ blkif_t *blkif;
+
+ long int frontend_id;
+
+ /* watch back end for changes */
+ struct xenbus_watch backend_watch;
+
+ /* watch front end for changes */
+ struct xenbus_watch watch;
+ char *frontpath;
+};
+
+static int blkback_remove(struct xenbus_device *dev)
+{
+ struct backend_info *be = dev->data;
+
+ if (be->watch.node)
+ unregister_xenbus_watch(&be->watch);
+ unregister_xenbus_watch(&be->backend_watch);
+ if (be->blkif)
+ blkif_put(be->blkif);
+ if (be->frontpath)
+ kfree(be->frontpath);
+ kfree(be);
+ return 0;
+}
+
+/* Front end tells us frame. */
+static void frontend_changed(struct xenbus_watch *watch, const char *node)
+{
+ unsigned long ring_ref;
+ unsigned int evtchn;
+ int err;
+ struct backend_info *be
+ = container_of(watch, struct backend_info, watch);
+
+ /* If other end is gone, delete ourself. */
+ if (node && !xenbus_exists(be->frontpath, "")) {
+ xenbus_rm(be->dev->nodename, "");
+ device_unregister(&be->dev->dev);
+ return;
+ }
+ if (be->blkif == NULL || be->blkif->status == CONNECTED)
+ return;
+
+ err = xenbus_gather(be->frontpath, "ring-ref", "%lu", &ring_ref,
+ "event-channel", "%u", &evtchn, NULL);
+ if (err) {
+ xenbus_dev_error(be->dev, err,
+ "reading %s/ring-ref and event-channel",
+ be->frontpath);
+ return;
+ }
+
+ /* Map the shared frame, irq etc. */
+ err = blkif_map(be->blkif, ring_ref, evtchn);
+ if (err) {
+ xenbus_dev_error(be->dev, err, "mapping ring-ref %lu port %u",
+ ring_ref, evtchn);
+ goto abort;
+ }
+
+ xenbus_dev_ok(be->dev);
+
+ return;
+
+abort:
+ xenbus_transaction_end(1);
+}
+
+/*
+ Setup supplies physical device.
+ We provide event channel and device details to front end.
+ Frontend supplies shared frame and event channel.
+ */
+static void backend_changed(struct xenbus_watch *watch, const char *node)
+{
+ int err;
+ char *p;
+ long int handle;
+ struct backend_info *be
+ = container_of(watch, struct backend_info, backend_watch);
+ struct xenbus_device *dev = be->dev;
+
+ if (be->blkif == NULL) {
+ /* Front end dir is a number, which is used as the handle. */
+ p = strrchr(be->frontpath, '/') + 1;
+ handle = simple_strtoul(p, NULL, 0);
+
+ be->blkif = alloc_blkif(be->frontend_id);
+ if (IS_ERR(be->blkif)) {
+ err = PTR_ERR(be->blkif);
+ be->blkif = NULL;
+ xenbus_dev_error(dev, err, "creating block interface");
+ return;
+ }
+
+ /* Pass in NULL node to skip exist test. */
+ frontend_changed(&be->watch, NULL);
+ }
+}
+
+static int blkback_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+ struct backend_info *be;
+ char *frontend;
+ int err;
+
+ be = kmalloc(sizeof(*be), GFP_KERNEL);
+ if (!be) {
+ xenbus_dev_error(dev, -ENOMEM, "allocating backend structure");
+ return -ENOMEM;
+ }
+ memset(be, 0, sizeof(*be));
+
+ frontend = NULL;
+ err = xenbus_gather(dev->nodename,
+ "frontend-id", "%li", &be->frontend_id,
+ "frontend", NULL, &frontend,
+ NULL);
+ if (XENBUS_EXIST_ERR(err))
+ goto free_be;
+ if (err < 0) {
+ xenbus_dev_error(dev, err,
+ "reading %s/frontend or frontend-id",
+ dev->nodename);
+ goto free_be;
+ }
+ if (strlen(frontend) == 0 || !xenbus_exists(frontend, "")) {
+ /* If we can't get a frontend path and a frontend-id,
+ * then our bus-id is no longer valid and we need to
+ * destroy the backend device.
+ */
+ err = -ENOENT;
+ goto free_be;
+ }
+
+ be->dev = dev;
+ be->backend_watch.node = dev->nodename;
+ be->backend_watch.callback = backend_changed;
+ err = register_xenbus_watch(&be->backend_watch);
+ if (err) {
+ be->backend_watch.node = NULL;
+ xenbus_dev_error(dev, err, "adding backend watch on %s",
+ dev->nodename);
+ goto free_be;
+ }
+
+ be->frontpath = frontend;
+ be->watch.node = be->frontpath;
+ be->watch.callback = frontend_changed;
+ err = register_xenbus_watch(&be->watch);
+ if (err) {
+ be->watch.node = NULL;
+ xenbus_dev_error(dev, err,
+ "adding frontend watch on %s",
+ be->frontpath);
+ goto free_be;
+ }
+
+ dev->data = be;
+
+ backend_changed(&be->backend_watch, dev->nodename);
+ return 0;
+
+ free_be:
+ if (be->backend_watch.node)
+ unregister_xenbus_watch(&be->backend_watch);
+ if (frontend)
+ kfree(frontend);
+ kfree(be);
+ return err;
+}
+
+static struct xenbus_device_id blkback_ids[] = {
+ { "vbd" },
+ { "" }
+};
+
+static struct xenbus_driver blkback = {
+ .name = "vbd",
+ .owner = THIS_MODULE,
+ .ids = blkback_ids,
+ .probe = blkback_probe,
+ .remove = blkback_remove,
+};
+
+void blkif_xenbus_init(void)
+{
+ xenbus_register_backend(&blkback);
+}
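
For reference, the connection handshake in this file turns on just four store
keys; a sketch of who supplies what (the exact directory layout is chosen by
the tools and is illustrative here):

    <backend-dir>/frontend-id     frontend domain id            (written by the tools)
    <backend-dir>/frontend        path of the frontend's dir    (written by the tools)
    <frontend-dir>/ring-ref       grant ref of the shared ring page  (frontend)
    <frontend-dir>/event-channel  the frontend's unbound event channel port (frontend)

blkback_probe seeds backend_changed, which allocates the blkif (the trailing
component of the frontend path supplies the handle); frontend_changed then
gathers ring-ref and event-channel and passes them to blkif_map() to complete
the connection.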
i++;
start += PAGE_SIZE;
len--;
-printk(KERN_ALERT "HIT 0x%lx\n", start);
continue;
}
-else printk(KERN_ALERT "MISS 0x%lx\n", start);
}
if (!vma || (vma->vm_flags & VM_IO)
include $(XEN_ROOT)/tools/Rules.mk
SUBDIRS :=
-SUBDIRS += parallax
+SUBDIRS += ublkback
+#SUBDIRS += parallax
BLKTAP_INSTALL_DIR = /usr/sbin
INSTALL_PROG = $(INSTALL) -m0755
INSTALL_DIR = $(INSTALL) -d -m0755
-INCLUDES += -I. -I $(XEN_LIBXC)
+INCLUDES += -I. -I $(XEN_LIBXC) -I $(XEN_XENSTORE)
LIBS := -lpthread -lz
SRCS :=
-SRCS += blktaplib.c
+SRCS += blktaplib.c xenbus.c blkif.c
CFLAGS += -Wall
CFLAGS += -Werror
CFLAGS += -g3
CFLAGS += -fno-strict-aliasing
CFLAGS += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
+# get asprintf():
+CFLAGS += -D _GNU_SOURCE
# Get gcc to generate the dependencies for us.
CFLAGS += -Wp,-MD,.$(@F).d
CFLAGS += $(INCLUDES)
DEPS = .*.d
OBJS = $(patsubst %.c,%.o,$(SRCS))
-IBINS = blkdump
+IBINS :=
+#IBINS += blkdump
LIB = libblktap.so libblktap.so.$(MAJOR) libblktap.so.$(MAJOR).$(MINOR)
-all: mk-symlinks libblktap.so blkdump
+all: mk-symlinks libblktap.so #blkdump
@set -e; for subdir in $(SUBDIRS); do \
$(MAKE) -C $$subdir $@; \
done
$(INSTALL_DIR) -p $(DESTDIR)/usr/include
$(INSTALL_PROG) $(LIB) $(DESTDIR)/usr/$(LIBDIR)
$(INSTALL_PROG) blktaplib.h $(DESTDIR)/usr/include
- $(INSTALL_PROG) $(IBINS) $(DESTDIR)$(BLKTAP_INSTALL_DIR)
+ #$(INSTALL_PROG) $(IBINS) $(DESTDIR)$(BLKTAP_INSTALL_DIR)
@set -e; for subdir in $(SUBDIRS); do \
$(MAKE) -C $$subdir $@; \
done
mv staging/i386/*.rpm .
rm -rf staging
-libblktap.so: $(OBJS)
- $(CC) $(CFLAGS) -Wl,-soname -Wl,$(SONAME) -shared -o \
- libblktap.so.$(MAJOR).$(MINOR) $^ $(LIBS)
+libblktap.so: $(OBJS)
+ $(CC) $(CFLAGS) -Wl,-soname -Wl,$(SONAME) -shared \
+ -L$(XEN_XENSTORE) -l xenstore \
+ -o libblktap.so.$(MAJOR).$(MINOR) $^ $(LIBS)
ln -sf libblktap.so.$(MAJOR).$(MINOR) libblktap.so.$(MAJOR)
ln -sf libblktap.so.$(MAJOR) $@
blkdump: libblktap.so
- $(CC) $(CFLAGS) -o blkdump -L$(XEN_LIBXC) -L. -l blktap blkdump.c
+ $(CC) $(CFLAGS) -o blkdump -L$(XEN_LIBXC) -L. \
+ -l blktap blkdump.c
.PHONY: TAGS clean install mk-symlinks rpm
--- /dev/null
+The blktap driver has been rewritten substantially based on the current
+blkback driver. I've removed passthrough support, as this is broken
+by the move to grant tables and the lack of transitive grants. A
+blktap VM is now only capable of terminating block requests in
+userspace.
+
+ublkback/ contains a _very_ initial cut at a user-level version of the block
+backend driver. It gives a working example of how the current tap
+interfaces are used, in particular w.r.t. the vbd directories in
+xenstore.
+
+parallax/ contains fairly recent parallax code. This does not run on
+the changed blktap interface, but should only be a couple of hours'
+work to get going again.
+
+All of the tricky bits are done, but there is plenty of cleaning to
+do, and the top-level functionality is not here yet. At the moment,
+the daemon ignores the pdev requested by the tools and opens the file
+or device specified by TMP_IMAGE_FILE_NAME in ublkback.c.
+
+TODO:
+1. Fix to allow pdev in the store to specify the device to open.
+2. Add support (to tools as well) to mount arbitrary files...
+ just write the filename to mount into the store, instead of pdev.
+3. Reexamine blkif refcounting; it is almost certainly broken at the moment.
+   - creating a blkif should take a reference.
+   - each inflight request should take a reference on dequeue in blktaplib.
+   - sending responses should drop refs.
+   - blkif should be implicitly freed when the refcount falls to 0.
+   (See the sketch after this list.)
+4. Modify the parallax req/rsp code as per ublkback to use the new tap
+ interfaces.
+5. Write a front end that allows parallax and normal mounts to coexist.
+6. Allow blkback and blktap to run at the same time.
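
As a sketch of the refcounting proposed in item 3, assuming a hypothetical
refcnt field were added to the userspace blkif_t in blktaplib.h (it does not
have one today), the intended life cycle looks roughly like this:

/* Sketch only: blkif_t has no refcnt field yet (TODO item 3). */
static inline void blkif_get(blkif_t *blkif)
{
    blkif->refcnt++;             /* at creation, and per request dequeued */
}

static inline void blkif_put(blkif_t *blkif)
{
    if (--blkif->refcnt == 0)    /* dropped as each response is sent */
        free_blkif(blkif);       /* implicit free when the count hits 0 */
}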
#include <stdio.h>
#include "blktaplib.h"
-int control_print(control_msg_t *msg)
-{
- if (msg->type != CMSG_BLKIF_BE)
- {
- printf("***\nUNEXPECTED CTRL MSG MAJOR TYPE(%d)\n***\n", msg->type);
- return 0;
- }
-
- switch(msg->subtype)
- {
- case CMSG_BLKIF_BE_CREATE:
- if ( msg->length != sizeof(blkif_be_create_t) )
- goto parse_error;
- printf("[CONTROL_MSG] CMSG_BLKIF_BE_CREATE(d:%d,h:%d)\n",
- ((blkif_be_create_t *)msg->msg)->domid,
- ((blkif_be_create_t *)msg->msg)->blkif_handle);
- break;
- case CMSG_BLKIF_BE_DESTROY:
- if ( msg->length != sizeof(blkif_be_destroy_t) )
- goto parse_error;
- printf("[CONTROL_MSG] CMSG_BLKIF_BE_DESTROY(d:%d,h:%d)\n",
- ((blkif_be_destroy_t *)msg->msg)->domid,
- ((blkif_be_destroy_t *)msg->msg)->blkif_handle);
- break;
- case CMSG_BLKIF_BE_CONNECT:
- if ( msg->length != sizeof(blkif_be_connect_t) )
- goto parse_error;
- printf("[CONTROL_MSG] CMSG_BLKIF_BE_CONNECT(d:%d,h:%d)\n",
- ((blkif_be_connect_t *)msg->msg)->domid,
- ((blkif_be_connect_t *)msg->msg)->blkif_handle);
- break;
- case CMSG_BLKIF_BE_DISCONNECT:
- if ( msg->length != sizeof(blkif_be_disconnect_t) )
- goto parse_error;
- printf("[CONTROL_MSG] CMSG_BLKIF_BE_DISCONNECT(d:%d,h:%d)\n",
- ((blkif_be_disconnect_t *)msg->msg)->domid,
- ((blkif_be_disconnect_t *)msg->msg)->blkif_handle);
- break;
- case CMSG_BLKIF_BE_VBD_CREATE:
- if ( msg->length != sizeof(blkif_be_vbd_create_t) )
- goto parse_error;
- printf("[CONTROL_MSG] CMSG_BLKIF_BE_VBD_CREATE(d:%d,h:%d,v:%d)\n",
- ((blkif_be_vbd_create_t *)msg->msg)->domid,
- ((blkif_be_vbd_create_t *)msg->msg)->blkif_handle,
- ((blkif_be_vbd_create_t *)msg->msg)->vdevice);
- break;
- case CMSG_BLKIF_BE_VBD_DESTROY:
- if ( msg->length != sizeof(blkif_be_vbd_destroy_t) )
- goto parse_error;
- printf("[CONTROL_MSG] CMSG_BLKIF_BE_VBD_DESTROY(d:%d,h:%d,v:%d)\n",
- ((blkif_be_vbd_destroy_t *)msg->msg)->domid,
- ((blkif_be_vbd_destroy_t *)msg->msg)->blkif_handle,
- ((blkif_be_vbd_destroy_t *)msg->msg)->vdevice);
- break;
- default:
- goto parse_error;
- }
-
- return 0;
-
-parse_error:
- printf("[CONTROL_MSG] Bad message type or length!\n");
- return 0;
-}
-
int request_print(blkif_request_t *req)
{
int i;
unsigned long fas;
- if ( req->operation == BLKIF_OP_PROBE ) {
- printf("[%2u:%2u<%s]\n", ID_TO_DOM(req->id), ID_TO_IDX(req->id),
- blkif_op_name[req->operation]);
- return BLKTAP_PASS;
- } else {
+ if ( (req->operation == BLKIF_OP_READ) ||
+ (req->operation == BLKIF_OP_WRITE) )
+ {
printf("[%2u:%2u<%5s] (nr_segs: %03u, dev: %03u, %010llu)\n",
ID_TO_DOM(req->id), ID_TO_IDX(req->id),
blkif_op_name[req->operation],
- req->nr_segments, req->device,
+ req->nr_segments, req->handle,
req->sector_number);
);
}
+ } else {
+ printf("Unknown request message type.\n");
}
return BLKTAP_PASS;
int response_print(blkif_response_t *rsp)
{
- if ( rsp->operation == BLKIF_OP_PROBE ) {
- printf("[%2u:%2u>%s]\n", ID_TO_DOM(rsp->id), ID_TO_IDX(rsp->id),
- blkif_op_name[rsp->operation]);
- return BLKTAP_PASS;
- } else {
+ if ( (rsp->operation == BLKIF_OP_READ) ||
+ (rsp->operation == BLKIF_OP_WRITE) )
+ {
printf("[%2u:%2u>%5s] (status: %d)\n",
ID_TO_DOM(rsp->id), ID_TO_IDX(rsp->id),
blkif_op_name[rsp->operation],
rsp->status);
+ } else {
+        printf("Unknown response message type.\n");
}
return BLKTAP_PASS;
}
int main(int argc, char *argv[])
{
- blktap_register_ctrl_hook("control_print", control_print);
blktap_register_request_hook("request_print", request_print);
blktap_register_response_hook("response_print", response_print);
blktap_listen();
--- /dev/null
+/*
+ * blkif.c
+ *
+ * The blkif interface for blktap. A blkif describes an in-use virtual disk.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <err.h>
+
+#include "blktaplib.h"
+
+#if 1
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+#define BLKIF_HASHSZ 1024
+#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1))
+
+static blkif_t *blkif_hash[BLKIF_HASHSZ];
+
+blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle)
+{
+ blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)];
+ while ( (blkif != NULL) &&
+ ((blkif->domid != domid) || (blkif->handle != handle)) )
+ blkif = blkif->hash_next;
+ return blkif;
+}
+
+blkif_t *alloc_blkif(domid_t domid)
+{
+ blkif_t *blkif;
+
+ blkif = (blkif_t *)malloc(sizeof(blkif_t));
+ if (!blkif)
+ return NULL;
+
+ memset(blkif, 0, sizeof(*blkif));
+ blkif->domid = domid;
+
+ return blkif;
+}
+
+static int (*new_blkif_hook)(blkif_t *blkif) = NULL;
+void register_new_blkif_hook(int (*fn)(blkif_t *blkif))
+{
+ new_blkif_hook = fn;
+}
+
+int blkif_init(blkif_t *blkif, long int handle, long int pdev,
+ long int readonly)
+{
+ domid_t domid;
+ blkif_t **pblkif;
+
+ if (blkif == NULL)
+ return -EINVAL;
+
+ domid = blkif->domid;
+ blkif->handle = handle;
+ blkif->pdev = pdev;
+ blkif->readonly = readonly;
+
+ /*
+ * Call out to the new_blkif_hook. The tap application should define this,
+ * and it should return having set blkif->ops
+ *
+ */
+ if (new_blkif_hook == NULL)
+ {
+ warn("Probe detected a new blkif, but no new_blkif_hook!");
+ return -1;
+ }
+ new_blkif_hook(blkif);
+
+ /* Now wire it in. */
+ pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
+ while ( *pblkif != NULL )
+ {
+ if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) )
+ {
+ DPRINTF("Could not create blkif: already exists\n");
+ return -1;
+ }
+ pblkif = &(*pblkif)->hash_next;
+ }
+ blkif->hash_next = NULL;
+ *pblkif = blkif;
+
+ return 0;
+}
+
+void free_blkif(blkif_t *blkif)
+{
+ blkif_t **pblkif, *curs;
+
+ pblkif = &blkif_hash[BLKIF_HASH(blkif->domid, blkif->handle)];
+ while ( (curs = *pblkif) != NULL )
+ {
+ if ( blkif == curs )
+ {
+ *pblkif = curs->hash_next;
+ }
+ pblkif = &curs->hash_next;
+ }
+ if (blkif != NULL)
+ free(blkif);
+}
+
+void blkif_register_request_hook(blkif_t *blkif, char *name,
+ int (*rh)(blkif_t *, blkif_request_t *, int))
+{
+ request_hook_t *rh_ent, **c;
+
+ rh_ent = (request_hook_t *)malloc(sizeof(request_hook_t));
+ if (!rh_ent)
+ {
+ warn("couldn't allocate a new hook");
+ return;
+ }
+
+ rh_ent->func = rh;
+ rh_ent->next = NULL;
+ if (asprintf(&rh_ent->name, "%s", name) == -1)
+ {
+ free(rh_ent);
+ warn("couldn't allocate a new hook name");
+ return;
+ }
+
+ c = &blkif->request_hook_chain;
+ while (*c != NULL) {
+ c = &(*c)->next;
+ }
+ *c = rh_ent;
+}
+
+void blkif_register_response_hook(blkif_t *blkif, char *name,
+ int (*rh)(blkif_t *, blkif_response_t *, int))
+{
+ response_hook_t *rh_ent, **c;
+
+ rh_ent = (response_hook_t *)malloc(sizeof(response_hook_t));
+ if (!rh_ent)
+ {
+ warn("couldn't allocate a new hook");
+ return;
+ }
+
+ rh_ent->func = rh;
+ rh_ent->next = NULL;
+ if (asprintf(&rh_ent->name, "%s", name) == -1)
+ {
+ free(rh_ent);
+ warn("couldn't allocate a new hook name");
+ return;
+ }
+
+ c = &blkif->response_hook_chain;
+ while (*c != NULL) {
+ c = &(*c)->next;
+ }
+ *c = rh_ent;
+}
+
+void blkif_print_hooks(blkif_t *blkif)
+{
+ request_hook_t *req_hook;
+ response_hook_t *rsp_hook;
+
+ DPRINTF("Request Hooks:\n");
+ req_hook = blkif->request_hook_chain;
+ while (req_hook != NULL)
+ {
+ DPRINTF(" [0x%p] %s\n", req_hook->func, req_hook->name);
+ req_hook = req_hook->next;
+ }
+
+ DPRINTF("Response Hooks:\n");
+ rsp_hook = blkif->response_hook_chain;
+ while (rsp_hook != NULL)
+ {
+ DPRINTF(" [0x%p] %s\n", rsp_hook->func, rsp_hook->name);
+ rsp_hook = rsp_hook->next;
+ }
+}
+
+
+/* Stub geometry for now: hardcoded placeholders until the pdev requested
+ * via the store is honoured (see the README's TODO). */
+long int vbd_size(blkif_t *blkif)
+{
+    return 1000000000; /* placeholder capacity, in sectors */
+}
+
+long int vbd_secsize(blkif_t *blkif)
+{
+    return 512;        /* standard sector size */
+}
+
+unsigned vbd_info(blkif_t *blkif)
+{
+    return 0;          /* no special VDISK_* flags */
+}
+
+
+void __init_blkif(void)
+{
+ memset(blkif_hash, 0, sizeof(blkif_hash));
+}
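
To tie the pieces in this file together: the library expects the tap
application to install a new_blkif_hook that sets blkif->ops and registers
its per-blkif hooks. A minimal sketch, where my_request_hook, my_ops and
my_new_blkif are illustrative names (ublkback/ is the real example of this
pattern):

/* assumes "blktaplib.h"; the names below are hypothetical */
static int my_request_hook(blkif_t *blkif, blkif_request_t *req, int last)
{
    /* terminate the request here; returning BLKTAP_RESPOND tells
     * blktaplib the request has been turned into a response in place */
    return BLKTAP_RESPOND;
}

static struct blkif_ops my_ops = {
    .get_size    = vbd_size,      /* the default stubs above */
    .get_secsize = vbd_secsize,
    .get_info    = vbd_info,
};

static int my_new_blkif(blkif_t *blkif)
{
    blkif->ops = &my_ops;
    blkif_register_request_hook(blkif, "my request hook", my_request_hook);
    return 0;
}

In main() the application would then call register_new_blkif_hook(my_new_blkif)
before blktap_listen(), as ublkback.c does.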
#include <string.h>
#include <unistd.h>
#include <pthread.h>
-
+#include <xs.h>
#define __COMPILING_BLKTAP_LIB
#include "blktaplib.h"
#else
#define DPRINTF(_f, _a...) ((void)0)
#endif
-#define DEBUG_RING_IDXS 1
+#define DEBUG_RING_IDXS 0
#define POLLRDNORM 0x040
#define BLKTAP_IOCTL_KICK 1
+
void got_sig_bus();
void got_sig_int();
/* in kernel these are opposite, but we are a consumer now. */
blkif_back_ring_t fe_ring; /* slightly counterintuitive ;) */
blkif_front_ring_t be_ring;
-ctrl_back_ring_t ctrl_ring;
unsigned long mmap_vstart = 0;
char *blktap_mem;
int fd = 0;
-#define BLKTAP_RING_PAGES 3 /* Ctrl, Back, Front */
-/*#define BLKTAP_MMAP_PAGES ((11 + 1) * 64)*/
-#define BLKTAP_MMAP_PAGES \
- ((BLKIF_MAX_SEGMENTS_PER_REQUEST + 1) * BLKIF_RING_SIZE)
-#define BLKTAP_MMAP_REGION_SIZE (BLKTAP_RING_PAGES + BLKTAP_MMAP_PAGES)
+#define BLKTAP_RING_PAGES 1 /* Front */
+#define BLKTAP_MMAP_REGION_SIZE (BLKTAP_RING_PAGES + MMAP_PAGES)
int bad_count = 0;
void bad(void)
}
inline domid_t ID_TO_DOM(unsigned long id) { return (id >> 16); }
-/*
+
static int (*request_hook)(blkif_request_t *req) = NULL;
static int (*response_hook)(blkif_response_t *req) = NULL;
-*/
-
-/*-----[ Request/Response hook chains.]----------------------------------*/
-
-#define HOOK_NAME_MAX 50
-
-typedef struct ctrl_hook_st {
- char name[HOOK_NAME_MAX];
- int (*func)(control_msg_t *);
- struct ctrl_hook_st *next;
-} ctrl_hook_t;
-
-typedef struct request_hook_st {
- char name[HOOK_NAME_MAX];
- int (*func)(blkif_request_t *);
- struct request_hook_st *next;
-} request_hook_t;
-
-typedef struct response_hook_st {
- char name[HOOK_NAME_MAX];
- int (*func)(blkif_response_t *);
- struct response_hook_st *next;
-} response_hook_t;
-
-static ctrl_hook_t *ctrl_hook_chain = NULL;
-static request_hook_t *request_hook_chain = NULL;
-static response_hook_t *response_hook_chain = NULL;
-
-void blktap_register_ctrl_hook(char *name, int (*ch)(control_msg_t *))
-{
- ctrl_hook_t *ch_ent, **c;
-
- ch_ent = (ctrl_hook_t *)malloc(sizeof(ctrl_hook_t));
- if (!ch_ent) { printf("couldn't allocate a new hook\n"); exit(-1); }
-
- ch_ent->func = ch;
- ch_ent->next = NULL;
- strncpy(ch_ent->name, name, HOOK_NAME_MAX);
- ch_ent->name[HOOK_NAME_MAX-1] = '\0';
-
- c = &ctrl_hook_chain;
- while (*c != NULL) {
- c = &(*c)->next;
- }
- *c = ch_ent;
-}
-
-void blktap_register_request_hook(char *name, int (*rh)(blkif_request_t *))
-{
- request_hook_t *rh_ent, **c;
-
- rh_ent = (request_hook_t *)malloc(sizeof(request_hook_t));
- if (!rh_ent) { printf("couldn't allocate a new hook\n"); exit(-1); }
-
- rh_ent->func = rh;
- rh_ent->next = NULL;
- strncpy(rh_ent->name, name, HOOK_NAME_MAX);
-
- c = &request_hook_chain;
- while (*c != NULL) {
- c = &(*c)->next;
- }
- *c = rh_ent;
-}
-
-void blktap_register_response_hook(char *name, int (*rh)(blkif_response_t *))
-{
- response_hook_t *rh_ent, **c;
-
- rh_ent = (response_hook_t *)malloc(sizeof(response_hook_t));
- if (!rh_ent) { printf("couldn't allocate a new hook\n"); exit(-1); }
-
- rh_ent->func = rh;
- rh_ent->next = NULL;
- strncpy(rh_ent->name, name, HOOK_NAME_MAX);
-
- c = &response_hook_chain;
- while (*c != NULL) {
- c = &(*c)->next;
- }
- *c = rh_ent;
-}
-
-void print_hooks(void)
-{
- request_hook_t *req_hook;
- response_hook_t *rsp_hook;
- ctrl_hook_t *ctrl_hook;
-
- DPRINTF("Control Hooks:\n");
- ctrl_hook = ctrl_hook_chain;
- while (ctrl_hook != NULL)
- {
- DPRINTF(" [0x%p] %s\n", ctrl_hook->func, ctrl_hook->name);
- ctrl_hook = ctrl_hook->next;
- }
-
- DPRINTF("Request Hooks:\n");
- req_hook = request_hook_chain;
- while (req_hook != NULL)
- {
- DPRINTF(" [0x%p] %s\n", req_hook->func, req_hook->name);
- req_hook = req_hook->next;
- }
-
- DPRINTF("Response Hooks:\n");
- rsp_hook = response_hook_chain;
- while (rsp_hook != NULL)
- {
- DPRINTF(" [0x%p] %s\n", rsp_hook->func, rsp_hook->name);
- rsp_hook = rsp_hook->next;
- }
-}
/*-----[ Data to/from Backend (server) VM ]------------------------------*/
-
+/*
inline int write_req_to_be_ring(blkif_request_t *req)
{
return 0;
}
+*/
inline int write_rsp_to_fe_ring(blkif_response_t *rsp)
{
return 0;
}
-static void apply_rsp_hooks(blkif_response_t *rsp)
+static void apply_rsp_hooks(blkif_t *blkif, blkif_response_t *rsp)
{
response_hook_t *rsp_hook;
- rsp_hook = response_hook_chain;
+ rsp_hook = blkif->response_hook_chain;
while (rsp_hook != NULL)
{
- switch(rsp_hook->func(rsp))
+ switch(rsp_hook->func(blkif, rsp, 1))
{
case BLKTAP_PASS:
break;
}
}
+
static pthread_mutex_t push_mutex = PTHREAD_MUTEX_INITIALIZER;
-void blktap_inject_response(blkif_response_t *rsp)
+void blkif_inject_response(blkif_t *blkif, blkif_response_t *rsp)
{
- apply_rsp_hooks(rsp);
-
+ apply_rsp_hooks(blkif, rsp);
+
write_rsp_to_fe_ring(rsp);
-
+}
+
+void blktap_kick_responses(void)
+{
pthread_mutex_lock(&push_mutex);
RING_PUSH_RESPONSES(&fe_ring);
int active;
} pollhook_t;
-static struct pollfd pfd[MAX_POLLFDS+1];
+static struct pollfd pfd[MAX_POLLFDS+2]; /* tap and store are extra */
static pollhook_t pollhooks[MAX_POLLFDS];
static unsigned int ph_freelist[MAX_POLLFDS];
static unsigned int ph_cons, ph_prod;
int blktap_listen(void)
{
- int notify_be, notify_fe, tap_pfd;
-
+ int notify_be, notify_fe, tap_pfd, store_pfd, xs_fd, ret;
+ struct xs_handle *h;
+ blkif_t *blkif;
+
/* comms rings: */
blkif_request_t *req;
blkif_response_t *rsp;
- control_msg_t *msg;
blkif_sring_t *sring;
- ctrl_sring_t *csring;
RING_IDX rp, i, pfd_count;
/* pending rings */
blkif_request_t req_pending[BLKIF_RING_SIZE];
- blkif_response_t rsp_pending[BLKIF_RING_SIZE];
+    /* blkif_response_t rsp_pending[BLKIF_RING_SIZE]; */
/* handler hooks: */
request_hook_t *req_hook;
response_hook_t *rsp_hook;
- ctrl_hook_t *ctrl_hook;
signal (SIGBUS, got_sig_bus);
signal (SIGINT, got_sig_int);
- print_hooks();
-
+ __init_blkif();
+
fd = open("/dev/blktap", O_RDWR);
- if (fd == -1) {
- printf("open failed! (%d)\n", errno);
- goto open_failed;
- }
+ if (fd == -1)
+ err(-1, "open failed!");
blktap_mem = mmap(0, PAGE_SIZE * BLKTAP_MMAP_REGION_SIZE,
PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
- if ((int)blktap_mem == -1) {
- printf("mmap failed! (%d)\n", errno);
- goto mmap_failed;
- }
+ if ((int)blktap_mem == -1)
+ err(-1, "mmap failed!");
/* assign the rings to the mapped memory */
- csring = (ctrl_sring_t *)blktap_mem;
- BACK_RING_INIT(&ctrl_ring, csring, PAGE_SIZE);
-
+/*
sring = (blkif_sring_t *)((unsigned long)blktap_mem + PAGE_SIZE);
FRONT_RING_INIT(&be_ring, sring, PAGE_SIZE);
-
- sring = (blkif_sring_t *)((unsigned long)blktap_mem + (2 *PAGE_SIZE));
+*/
+ sring = (blkif_sring_t *)((unsigned long)blktap_mem);
BACK_RING_INIT(&fe_ring, sring, PAGE_SIZE);
mmap_vstart = (unsigned long)blktap_mem +(BLKTAP_RING_PAGES << PAGE_SHIFT);
+
+ /* Set up store connection and watch. */
+ h = xs_daemon_open();
+ if (h == NULL)
+ err(-1, "xs_daemon_open");
+
+ ret = add_blockdevice_probe_watch(h, "Domain-0");
+ if (ret != 0)
+        err(0, "adding device probe watch");
+
ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE );
while(1) {
int ret;
/* build the poll list */
-
- DPRINTF("Building poll list.\n");
-
pfd_count = 0;
for ( i=0; i < MAX_POLLFDS; i++ ) {
pollhook_t *ph = &pollhooks[i];
}
}
- tap_pfd = pfd_count;
+ tap_pfd = pfd_count++;
pfd[tap_pfd].fd = fd;
pfd[tap_pfd].events = POLLIN;
- DPRINTF("poll() %d fds.\n", pfd_count);
+ store_pfd = pfd_count++;
+ pfd[store_pfd].fd = xs_fileno(h);
+ pfd[store_pfd].events = POLLIN;
- if ( (ret = (poll(pfd, pfd_count+1, 10000)) == 0) ) {
+        if ( (ret = poll(pfd, pfd_count, 10000)) == 0 ) {
if (DEBUG_RING_IDXS)
ioctl(fd, BLKTAP_IOCTL_PRINT_IDXS);
continue;
}
- DPRINTF("poll returned %d\n", ret);
-
for (i=0; i < MAX_POLLFDS; i++) {
if ( (pollhooks[i].active ) && (pollhooks[i].pfd->revents ) )
pollhooks[i].func(pollhooks[i].pfd->fd);
}
- if (pfd[tap_pfd].revents) {
-
- /* empty the control ring */
- rp = ctrl_ring.sring->req_prod;
- rmb();
- for (i = ctrl_ring.req_cons; i < rp; i++)
- {
- msg = RING_GET_REQUEST(&ctrl_ring, i);
+ if (pfd[store_pfd].revents) {
+ ret = xs_fire_next_watch(h);
+ }
- ctrl_hook = ctrl_hook_chain;
- while (ctrl_hook != NULL)
- {
- DPRINTF("CTRL_HOOK: %s\n", ctrl_hook->name);
- /* We currently don't respond to ctrl messages. */
- ctrl_hook->func(msg);
- ctrl_hook = ctrl_hook->next;
- }
- }
- /* Using this as a unidirectional ring. */
- ctrl_ring.req_cons = ctrl_ring.rsp_prod_pvt = i;
-pthread_mutex_lock(&push_mutex);
- RING_PUSH_RESPONSES(&ctrl_ring);
-pthread_mutex_unlock(&push_mutex);
-
+ if (pfd[tap_pfd].revents)
+ {
/* empty the fe_ring */
notify_fe = 0;
notify_be = RING_HAS_UNCONSUMED_REQUESTS(&fe_ring);
rmb();
for (i = fe_ring.req_cons; i != rp; i++)
{
- int done = 0; /* stop forwarding this request */
+ int done = 0;
req = RING_GET_REQUEST(&fe_ring, i);
memcpy(&req_pending[ID_TO_IDX(req->id)], req, sizeof(*req));
req = &req_pending[ID_TO_IDX(req->id)];
- DPRINTF("copying an fe request\n");
+ blkif = blkif_find_by_handle(ID_TO_DOM(req->id), req->handle);
- req_hook = request_hook_chain;
- while (req_hook != NULL)
+ if (blkif != NULL)
{
- DPRINTF("REQ_HOOK: %s\n", req_hook->name);
- switch(req_hook->func(req))
+ req_hook = blkif->request_hook_chain;
+ while (req_hook != NULL)
{
- case BLKTAP_RESPOND:
- apply_rsp_hooks((blkif_response_t *)req);
- write_rsp_to_fe_ring((blkif_response_t *)req);
- notify_fe = 1;
- done = 1;
- break;
- case BLKTAP_STOLEN:
- done = 1;
- break;
- case BLKTAP_PASS:
- break;
- default:
- printf("Unknown request hook return value!\n");
+ switch(req_hook->func(blkif, req, ((i+1) == rp)))
+ {
+ case BLKTAP_RESPOND:
+ apply_rsp_hooks(blkif, (blkif_response_t *)req);
+ write_rsp_to_fe_ring((blkif_response_t *)req);
+ notify_fe = 1;
+ done = 1;
+ break;
+ case BLKTAP_STOLEN:
+ done = 1;
+ break;
+ case BLKTAP_PASS:
+ break;
+ default:
+ printf("Unknown request hook return value!\n");
+ }
+ if (done) break;
+ req_hook = req_hook->next;
}
- if (done) break;
- req_hook = req_hook->next;
}
- if (done == 0) write_req_to_be_ring(req);
+ if (done == 0)
+ {
+ /* this was: */
+ /* write_req_to_be_ring(req); */
+
+ unsigned long id = req->id;
+ unsigned short operation = req->operation;
+ printf("Unterminated request!\n");
+ rsp = (blkif_response_t *)req;
+ rsp->id = id;
+ rsp->operation = operation;
+ rsp->status = BLKIF_RSP_ERROR;
+ write_rsp_to_fe_ring(rsp);
+ notify_fe = 1;
+ done = 1;
+ }
}
fe_ring.req_cons = i;
/* empty the be_ring */
+/*
notify_fe |= RING_HAS_UNCONSUMED_RESPONSES(&be_ring);
rp = be_ring.sring->rsp_prod;
rmb();
write_rsp_to_fe_ring(rsp);
}
be_ring.rsp_cons = i;
-
+*/
/* notify the domains */
-
+/*
if (notify_be) {
DPRINTF("notifying be\n");
pthread_mutex_lock(&push_mutex);
ioctl(fd, BLKTAP_IOCTL_KICK_BE);
pthread_mutex_unlock(&push_mutex);
}
-
+*/
if (notify_fe) {
DPRINTF("notifying fe\n");
-pthread_mutex_lock(&push_mutex);
+ pthread_mutex_lock(&push_mutex);
RING_PUSH_RESPONSES(&fe_ring);
ioctl(fd, BLKTAP_IOCTL_KICK_FE);
-pthread_mutex_unlock(&push_mutex);
+ pthread_mutex_unlock(&push_mutex);
}
}
}
*
* userland accessors to the block tap.
*
+ * Sept 2/05 -- I'm scaling this back to only support block remappings
+ * to userspace in a backend domain. Passthrough and interposition can be
+ * re-added once transitive grants are available.
*/
#ifndef __BLKTAPLIB_H__
#include <xen/io/blkif.h>
#include <xen/io/ring.h>
#include <xen/io/domain_controller.h>
+#include <xs.h>
/* /dev/xen/blktap resides at device number major=10, minor=202 */
#define BLKTAP_MINOR 202
static inline int BLKTAP_MODE_VALID(unsigned long arg)
{
+ return (
+ ( arg == BLKTAP_MODE_PASSTHROUGH ) ||
+ ( arg == BLKTAP_MODE_INTERCEPT_FE ) ||
+ ( arg == BLKTAP_MODE_INTERPOSE ) );
+/*
return (
( arg == BLKTAP_MODE_PASSTHROUGH ) ||
( arg == BLKTAP_MODE_INTERCEPT_FE ) ||
( (arg & ~BLKTAP_MODE_COPY_BE_PAGES) == BLKTAP_MODE_COPY_BE ) ||
( (arg & ~BLKTAP_MODE_COPY_BOTH_PAGES) == BLKTAP_MODE_COPY_BOTH )
);
+*/
}
/* Return values for handling messages in hooks. */
#define BLKTAP_RESPOND 1 /* Request is now a reply. Return it. */
#define BLKTAP_STOLEN 2 /* Hook has stolen request. */
-#define domid_t unsigned short
+//#define domid_t unsigned short
inline unsigned int ID_TO_IDX(unsigned long id);
inline domid_t ID_TO_DOM(unsigned long id);
-void blktap_register_ctrl_hook(char *name, int (*ch)(control_msg_t *));
-void blktap_register_request_hook(char *name, int (*rh)(blkif_request_t *));
-void blktap_register_response_hook(char *name, int (*rh)(blkif_response_t *));
-void blktap_inject_response(blkif_response_t *);
int blktap_attach_poll(int fd, short events, int (*func)(int));
void blktap_detach_poll(int fd);
int blktap_listen(void);
+struct blkif;
+
+typedef struct request_hook_st {
+ char *name;
+ int (*func)(struct blkif *, blkif_request_t *, int);
+ struct request_hook_st *next;
+} request_hook_t;
+
+typedef struct response_hook_st {
+ char *name;
+ int (*func)(struct blkif *, blkif_response_t *, int);
+ struct response_hook_st *next;
+} response_hook_t;
+
+struct blkif_ops {
+ long int (*get_size)(struct blkif *blkif);
+ long int (*get_secsize)(struct blkif *blkif);
+ unsigned (*get_info)(struct blkif *blkif);
+};
+
+typedef struct blkif {
+ domid_t domid;
+ long int handle;
+
+ long int pdev;
+ long int readonly;
+
+ enum { DISCONNECTED, CONNECTED } state;
+
+ struct blkif_ops *ops;
+ request_hook_t *request_hook_chain;
+ response_hook_t *response_hook_chain;
+
+ struct blkif *hash_next;
+
+ void *prv; /* device-specific data */
+} blkif_t;
+
+void register_new_blkif_hook(int (*fn)(blkif_t *blkif));
+blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle);
+blkif_t *alloc_blkif(domid_t domid);
+int blkif_init(blkif_t *blkif, long int handle, long int pdev,
+ long int readonly);
+void free_blkif(blkif_t *blkif);
+void __init_blkif(void);
+
+
+/* xenstore/xenbus: */
+extern int add_blockdevice_probe_watch(struct xs_handle *h,
+ const char *domname);
+int xs_fire_next_watch(struct xs_handle *h);
+
+
+void blkif_print_hooks(blkif_t *blkif);
+void blkif_register_request_hook(blkif_t *blkif, char *name,
+ int (*rh)(blkif_t *, blkif_request_t *, int));
+void blkif_register_response_hook(blkif_t *blkif, char *name,
+ int (*rh)(blkif_t *, blkif_response_t *, int));
+void blkif_inject_response(blkif_t *blkif, blkif_response_t *);
+void blktap_kick_responses(void);
+
+/* this must match the underlying driver... */
+#define MAX_PENDING_REQS 64
+
/* Accessing attached data page mappings */
-#define MMAP_PAGES_PER_REQUEST \
- (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
-#define MMAP_VADDR(_req,_seg) \
- (mmap_vstart + \
- ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
+#define MMAP_PAGES \
+ (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
+#define MMAP_VADDR(_req,_seg) \
+ (mmap_vstart + \
+ ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \
((_seg) * PAGE_SIZE))
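+/* i.e. data pages for pending request slot _req are mapped contiguously at
+ * mmap_vstart, BLKIF_MAX_SEGMENTS_PER_REQUEST pages per slot, and
+ * MMAP_VADDR(_req,_seg) locates segment _seg within that slot. */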
extern unsigned long mmap_vstart;
-
/* Defines that are only used by library clients */
#ifndef __COMPILING_BLKTAP_LIB
static char *blkif_op_name[] = {
[BLKIF_OP_READ] = "READ",
[BLKIF_OP_WRITE] = "WRITE",
- [BLKIF_OP_PROBE] = "PROBE",
};
#endif /* __COMPILING_BLKTAP_LIB */
--- /dev/null
+/*
+ * list.h
+ *
+ * This is a subset of linux's list.h intended to be used in user-space.
+ *
+ */
+
+#ifndef __LIST_H__
+#define __LIST_H__
+
+#define LIST_POISON1 ((void *) 0x00100100)
+#define LIST_POISON2 ((void *) 0x00200200)
+
+struct list_head {
+ struct list_head *next, *prev;
+};
+
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+#define LIST_HEAD(name) \
+ struct list_head name = LIST_HEAD_INIT(name)
+
+static inline void __list_add(struct list_head *new,
+ struct list_head *prev,
+ struct list_head *next)
+{
+ next->prev = new;
+ new->next = next;
+ new->prev = prev;
+ prev->next = new;
+}
+
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+ __list_add(new, head, head->next);
+}
+
+static inline void __list_del(struct list_head * prev, struct list_head * next)
+{
+    next->prev = prev;
+    prev->next = next;
+}
+
+static inline void list_del(struct list_head *entry)
+{
+    __list_del(entry->prev, entry->next);
+    entry->next = LIST_POISON1;
+    entry->prev = LIST_POISON2;
+}
+
+#define list_entry(ptr, type, member) \
+ ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+#define list_for_each_entry(pos, head, member) \
+ for (pos = list_entry((head)->next, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = list_entry(pos->member.next, typeof(*pos), member))
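+/*
+ * Usage sketch (struct foo is hypothetical): embed a list_head in your
+ * type, then add and iterate:
+ *
+ *     struct foo { int x; struct list_head list; };
+ *     LIST_HEAD(foo_list);
+ *     struct foo *f = malloc(sizeof(*f));
+ *
+ *     list_add(&f->list, &foo_list);
+ *     list_for_each_entry(f, &foo_list, list)
+ *         printf("%d\n", f->x);
+ */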
+
+#endif /* __LIST_H__ */
--- /dev/null
+
+XEN_ROOT = ../../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+INCLUDES += -I..
+
+INSTALL = install
+INSTALL_PROG = $(INSTALL) -m0755
+IBIN = ublkback
+INSTALL_DIR = /usr/sbin
+
+CFLAGS += -Wall
+CFLAGS += -Werror
+CFLAGS += -Wno-unused
+#CFLAGS += -O3
+CFLAGS += -g3
+CFLAGS += -fno-strict-aliasing
+CFLAGS += -I $(XEN_LIBXC)
+CFLAGS += $(INCLUDES) -I.
+CFLAGS += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
+# Get gcc to generate the dependencies for us.
+CFLAGS += -Wp,-MD,.$(@F).d
+DEPS = .*.d
+
+SRCS = ublkback.c ublkbacklib.c
+OBJS = $(patsubst %.c,%.o,$(SRCS))
+
+all: $(IBIN)
+
+LINUX_ROOT := $(wildcard $(XEN_ROOT)/linux-2.6.*-xen-sparse)
+
+install:
+ $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(INSTALL_DIR)
+clean:
+	rm -rf *.o *~ $(DEPS) xen TAGS $(IBIN)
+
+ublkback: ublkback.c ublkbacklib.c ublkbacklib.h
+	$(CC) $(CFLAGS) -o ublkback ublkback.c ublkbacklib.c \
+	      -L$(XEN_LIBXC) -L. -L.. -lblktap -laio -pg
+
+.PHONY: clean install
+
+-include $(DEPS)
--- /dev/null
+/* ublkback.c
+ *
+ * libaio-based userlevel backend.
+ */
+
+#include "blktaplib.h"
+#include "ublkbacklib.h"
+
+
+int main(int argc, char *argv[])
+{
+ ublkback_init();
+
+ register_new_blkif_hook(ublkback_new_blkif);
+ blktap_listen();
+
+ return 0;
+}
--- /dev/null
+/* ublkbacklib.c
+ *
+ * file/device image-backed block device -- using linux libaio.
+ *
+ * (c) 2004 Andrew Warfield.
+ *
+ * Xend has been modified to use an amorfs:[fsid] disk tag.
+ * This will show up as device type (maj:240,min:0) = 61440.
+ *
+ * The fsid is placed in the sec_start field of the disk extent.
+ *
+ * NOTE: This doesn't work. Grrr.
+ */
+
+#define _GNU_SOURCE
+#define __USE_LARGEFILE64
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <string.h>
+#include <db.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/poll.h>
+#include <unistd.h>
+#include <errno.h>
+#include <libaio.h>
+#include <pthread.h>
+#include <time.h>
+#include <err.h>
+#include "blktaplib.h"
+
+/* XXX: the current code simply serves this one file/device to any
+ * block interface that connects. */
+//#define TMP_IMAGE_FILE_NAME "/dev/sda1"
+#define TMP_IMAGE_FILE_NAME "fc3.image"
+
+#define MAX_REQUESTS 64 /* must be synced with the blkif drivers. */
+#define MAX_SEGMENTS_PER_REQ 11
+#define SECTOR_SHIFT 9
+#define MAX_AIO_REQS (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ)
+
+#if 0
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+#if 1
+#define ASSERT(_p) \
+    if ( !(_p) ) { printf("Assertion '%s' failed, line %d, file %s\n", #_p , \
+ __LINE__, __FILE__); *(int*)0=0; }
+#else
+#define ASSERT(_p) ((void)0)
+#endif
+
+/* Note on pending_reqs: all reqs are assumed to be queued before they start
+ * to be filled, so a count of 0 marks an unused record.
+ */
+typedef struct {
+ blkif_request_t req;
+ blkif_t *blkif;
+ int count;
+} pending_req_t;
+
+static pending_req_t pending_list[MAX_REQUESTS];
+static io_context_t ctx;
+static struct iocb *iocb_free[MAX_AIO_REQS];
+static int iocb_free_count;
+
+/* ---[ Notification mechanism ]--------------------------------------- */
+
+enum {
+ READ = 0,
+ WRITE = 1
+};
+
+static int aio_notify[2];
+static volatile int aio_listening = 0;
+static pthread_mutex_t notifier_sem = PTHREAD_MUTEX_INITIALIZER;
+
+static struct io_event aio_events[MAX_AIO_REQS];
+static int aio_event_count = 0;
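+/*
+ * Completion notification: notifier_thread() blocks in io_getevents(); when
+ * completions arrive it stashes them in aio_events[] and writes a token to
+ * the aio_notify pipe. The main poll loop then fires ublkback_pollhook(),
+ * which consumes the events and unlocks notifier_sem so the thread can
+ * wait again.
+ */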
+
+/* this is commented out in libaio.h for some reason. */
+extern int io_queue_wait(io_context_t ctx, struct timespec *timeout);
+
+static void *notifier_thread(void *arg)
+{
+ int ret;
+ int msg = 0x00feeb00;
+
+ DPRINTF("Notifier thread started.\n");
+ for (;;) {
+        pthread_mutex_lock(&notifier_sem);
+        if ((ret = io_getevents(ctx, 1, MAX_AIO_REQS, aio_events, NULL)) > 0) {
+            aio_event_count = ret;
+            write(aio_notify[WRITE], &msg, sizeof(msg));
+        } else {
+            printf("[io_getevents error! %d]\n", errno);
+            pthread_mutex_unlock(&notifier_sem);
+ }
+ }
+}
+
+/* --- Talking to xenstore: ------------------------------------------- */
+
+int ublkback_request(blkif_t *blkif, blkif_request_t *req, int batch_done);
+int ublkback_response(blkif_t *blkif, blkif_response_t *rsp, int batch_done);
+
+typedef struct image {
+ /* These need to turn into an array/rbtree for multi-disk support. */
+ int fd;
+ u64 fsid;
+ blkif_vdev_t vdevice;
+ long int size;
+ long int secsize;
+ long int info;
+} image_t;
+
+long int ublkback_get_size(blkif_t *blkif)
+{
+ image_t *img = (image_t *)blkif->prv;
+ return img->size;
+}
+
+long int ublkback_get_secsize(blkif_t *blkif)
+{
+ image_t *img = (image_t *)blkif->prv;
+ return img->secsize;
+}
+
+unsigned ublkback_get_info(blkif_t *blkif)
+{
+ image_t *img = (image_t *)blkif->prv;
+ return img->info;
+}
+
+static struct blkif_ops ublkback_ops = {
+ get_size: ublkback_get_size,
+ get_secsize: ublkback_get_secsize,
+ get_info: ublkback_get_info,
+};
+
+int ublkback_new_blkif(blkif_t *blkif)
+{
+ image_t *image;
+    struct stat st;
+ int ret;
+
+ image = (image_t *)malloc(sizeof(image_t));
+ if (image == NULL) {
+ printf("error allocating image record.\n");
+ return -ENOMEM;
+ }
+
+ /* Open it. */
+ image->fd = open(TMP_IMAGE_FILE_NAME,
+ O_RDWR | O_DIRECT | O_LARGEFILE);
+
+ if ((image->fd < 0) && (errno == EINVAL)) {
+ /* Maybe O_DIRECT isn't supported. */
+ warn("open() failed on '%s', trying again without O_DIRECT",
+ TMP_IMAGE_FILE_NAME);
+ image->fd = open(TMP_IMAGE_FILE_NAME, O_RDWR | O_LARGEFILE);
+ }
+
+ if (image->fd < 0) {
+ warn("Couldn't open image file!");
+ free(image);
+ return -EINVAL;
+ }
+
+ /* Size it. */
+    ret = fstat(image->fd, &st);
+    if (ret != 0) {
+        printf("Couldn't stat image in PROBE!\n");
+        close(image->fd);
+        free(image);
+        return -EINVAL;
+    }
+
+    image->size = (st.st_size >> SECTOR_SHIFT);
+
+ /* TODO: IOCTL to get size of raw device. */
+/*
+ ret = ioctl(img->fd, BLKGETSIZE, &blksize);
+ if (ret != 0) {
+ printf("Couldn't ioctl image in PROBE!\n");
+ goto err;
+ }
+*/
+ if (image->size == 0)
+        image->size = ((u64) 16836057); /* fallback size, in sectors */
+ image->secsize = 512;
+ image->info = 0;
+
+ /* Register the hooks */
+ blkif_register_request_hook(blkif, "Ublkback req.", ublkback_request);
+ blkif_register_response_hook(blkif, "Ublkback resp.", ublkback_response);
+
+
+ printf(">X<Created a new blkif! pdev was %ld, but you got %s\n",
+ blkif->pdev, TMP_IMAGE_FILE_NAME);
+
+ blkif->ops = &ublkback_ops;
+ blkif->prv = (void *)image;
+
+ return 0;
+}
+
+
+/* --- Moving the bits: ----------------------------------------------- */
+
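+/* Requests arrive in batches; iocbs are queued in ioq[] per request and
+ * submitted with a single io_submit() when batch_done is set on the last
+ * request of the batch. */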
+static int batch_count = 0;
+int ublkback_request(blkif_t *blkif, blkif_request_t *req, int batch_done)
+{
+ int fd;
+ u64 sector;
+ char *spage, *dpage;
+ int ret, i, idx;
+ blkif_response_t *rsp;
+ domid_t dom = ID_TO_DOM(req->id);
+ static struct iocb *ioq[MAX_SEGMENTS_PER_REQ*MAX_REQUESTS];
+ static int io_idx = 0;
+ struct iocb *io;
+ image_t *img;
+
+ img = (image_t *)blkif->prv;
+ fd = img->fd;
+
+ switch (req->operation)
+ {
+ case BLKIF_OP_WRITE:
+ {
+ unsigned long size;
+
+
+ batch_count++;
+
+ idx = ID_TO_IDX(req->id);
+ ASSERT(pending_list[idx].count == 0);
+ memcpy(&pending_list[idx].req, req, sizeof(*req));
+ pending_list[idx].count = req->nr_segments;
+ pending_list[idx].blkif = blkif;
+
+ for (i = 0; i < req->nr_segments; i++) {
+
+ sector = req->sector_number + (8*i);
+
+ size = blkif_last_sect (req->frame_and_sects[i]) -
+ blkif_first_sect(req->frame_and_sects[i]) + 1;
+
+ if (blkif_first_sect(req->frame_and_sects[i]) != 0)
+ DPRINTF("iWR: sec_nr: %10llu sec: %10llu (%1lu,%1lu) pos: %15lu\n",
+ req->sector_number, sector,
+ blkif_first_sect(req->frame_and_sects[i]),
+ blkif_last_sect (req->frame_and_sects[i]),
+ (long)(sector << SECTOR_SHIFT));
+
+ spage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i);
+ spage += blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT;
+
+ /*convert size and sector to byte offsets */
+ size <<= SECTOR_SHIFT;
+ sector <<= SECTOR_SHIFT;
+
+ io = iocb_free[--iocb_free_count];
+ io_prep_pwrite(io, fd, spage, size, sector);
+            io->data = (void *)idx;
+            ioq[io_idx++] = io;
+ }
+
+ if (batch_done) {
+ ret = io_submit(ctx, io_idx, ioq);
+ batch_count = 0;
+ if (ret < 0)
+ printf("BADNESS: io_submit error! (%d)\n", errno);
+ io_idx = 0;
+ }
+
+ return BLKTAP_STOLEN;
+
+ }
+ case BLKIF_OP_READ:
+ {
+ unsigned long size;
+
+ batch_count++;
+ idx = ID_TO_IDX(req->id);
+ ASSERT(pending_list[idx].count == 0);
+ memcpy(&pending_list[idx].req, req, sizeof(*req));
+ pending_list[idx].count = req->nr_segments;
+ pending_list[idx].blkif = blkif;
+
+ for (i = 0; i < req->nr_segments; i++) {
+
+ sector = req->sector_number + (8*i);
+
+ size = blkif_last_sect (req->frame_and_sects[i]) -
+ blkif_first_sect(req->frame_and_sects[i]) + 1;
+
+ dpage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i);
+ dpage += blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT;
+
+ if (blkif_first_sect(req->frame_and_sects[i]) != 0)
+ DPRINTF("iRD : sec_nr: %10llu sec: %10llu (%1lu,%1lu) "
+ "pos: %15lu dpage: %p\n",
+ req->sector_number, sector,
+ blkif_first_sect(req->frame_and_sects[i]),
+ blkif_last_sect (req->frame_and_sects[i]),
+ (long)(sector << SECTOR_SHIFT), dpage);
+
+ /*convert size and sector to byte offsets */
+ size <<= SECTOR_SHIFT;
+ sector <<= SECTOR_SHIFT;
+
+
+ /*
+ * NB: Looks like AIO now has non-page aligned support, this path
+ * can probably be removed... Only really used for hunting
+ * superblocks anyway... ;)
+ */
+ if ( ((unsigned long)dpage % PAGE_SIZE) != 0 ) {
+ /* AIO to raw devices must be page aligned, so do this read
+ * synchronously. The OS is probably just looking for
+ * a superblock or something, so this won't hurt performance.
+ */
+ int ret;
+
+ printf("Slow path block read.\n");
+ /* Question: do in-progress aio ops modify the file cursor? */
+ ret = lseek(fd, sector, SEEK_SET);
+ if (ret == (off_t)-1)
+ printf("lseek failed!\n");
+ ret = read(fd, dpage, size);
+ if (ret < 0)
+ printf("read problem (%d)\n", ret);
+ printf("|\n|\n| read: %lld, %lu, %d\n|\n|\n", sector, size, ret);
+
+                /* not an async request any more... */
+                pending_list[idx].count--;
+
+                /* Flush any aio already queued in this batch before
+                 * responding, so queued iocbs are not left unsubmitted.
+                 * NB: this path is only expected for single-segment reads. */
+                if (batch_done && (io_idx > 0)) {
+                    ret = io_submit(ctx, io_idx, ioq);
+                    batch_count = 0;
+                    if (ret < 0)
+                        printf("BADNESS: io_submit error! (%d)\n", errno);
+                    io_idx = 0;
+                }
+
+                rsp = (blkif_response_t *)req;
+                rsp->id = req->id;
+                rsp->operation = BLKIF_OP_READ;
+                rsp->status = BLKIF_RSP_OKAY;
+                return BLKTAP_RESPOND;
+            }
+
+ io = iocb_free[--iocb_free_count];
+
+ io_prep_pread(io, fd, dpage, size, sector);
+ io->data = (void *)idx;
+
+            ioq[io_idx++] = io;
+ }
+
+ if (batch_done) {
+ ret = io_submit(ctx, io_idx, ioq);
+ batch_count = 0;
+ if (ret < 0)
+ printf("BADNESS: io_submit error! (%d)\n", errno);
+ io_idx = 0;
+ }
+
+ return BLKTAP_STOLEN;
+
+ }
+ }
+
+ printf("Unknown block operation!\n");
+err:
+ rsp = (blkif_response_t *)req;
+ rsp->id = req->id;
+ rsp->operation = req->operation;
+ rsp->status = BLKIF_RSP_ERROR;
+ return BLKTAP_RESPOND;
+}
+
+
+int ublkback_pollhook(int fd)
+{
+ struct io_event *ep;
+ int n, ret, idx;
+ blkif_request_t *req;
+ blkif_response_t *rsp;
+ int responses_queued = 0;
+ int pages=0;
+
+ for (ep = aio_events; aio_event_count-- > 0; ep++) {
+ struct iocb *io = ep->obj;
+ idx = (int) ep->data;
+
+ if ((idx > MAX_REQUESTS-1) || (pending_list[idx].count == 0)){
+ printf("invalid index returned(%u)!\n", idx);
+ break;
+ }
+
+ if ((int)ep->res < 0)
+ printf("***\n***aio request error! (%d,%d)\n***\n",
+ (int)ep->res, (int)ep->res2);
+
+ pending_list[idx].count--;
+ iocb_free[iocb_free_count++] = io;
+ pages++;
+
+ if (pending_list[idx].count == 0) {
+ blkif_request_t tmp = pending_list[idx].req;
+ rsp = (blkif_response_t *)&pending_list[idx].req;
+ rsp->id = tmp.id;
+ rsp->operation = tmp.operation;
+ rsp->status = BLKIF_RSP_OKAY;
+ blkif_inject_response(pending_list[idx].blkif, rsp);
+ responses_queued++;
+ }
+ }
+
+ if (responses_queued) {
+ blktap_kick_responses();
+ }
+
+ read(aio_notify[READ], &idx, sizeof(idx));
+ aio_listening = 1;
+    pthread_mutex_unlock(&notifier_sem);
+
+ return 0;
+}
+
+/* the image library terminates the request stream. _resp is a noop. */
+int ublkback_response(blkif_t *blkif, blkif_response_t *rsp, int batch_done)
+{
+ return BLKTAP_PASS;
+}
+
+void ublkback_init(void)
+{
+ int i, rc;
+ pthread_t p;
+
+ for (i = 0; i < MAX_REQUESTS; i++)
+ pending_list[i].count = 0;
+
+ memset(&ctx, 0, sizeof(ctx));
+ rc = io_queue_init(MAX_AIO_REQS, &ctx);
+ if (rc != 0) {
+ printf("queue_init failed! (%d)\n", rc);
+        exit(1);
+ }
+
+    for (i=0; i<MAX_AIO_REQS; i++) {
+        if (!(iocb_free[i] = (struct iocb *)malloc(sizeof(struct iocb)))) {
+            printf("error allocating iocb array\n");
+            exit(1);
+        }
+    }
+    iocb_free_count = MAX_AIO_REQS;
+
+ rc = pipe(aio_notify);
+ if (rc != 0) {
+ printf("pipe failed! (%d)\n", errno);
+        exit(1);
+ }
+
+ rc = pthread_create(&p, NULL, notifier_thread, NULL);
+ if (rc != 0) {
+ printf("pthread_create failed! (%d)\n", errno);
+        exit(1);
+ }
+
+ aio_listening = 1;
+
+ blktap_attach_poll(aio_notify[READ], POLLIN, ublkback_pollhook);
+}
+
--- /dev/null
+/* ublkbacklib.h
+ *
+ * aio image-backed block device.
+ *
+ * (c) 2004 Andrew Warfield.
+ *
+ * Xend has been modified to use an amorfs:[fsid] disk tag.
+ * This will show up as device type (maj:240,min:0) = 61440.
+ *
+ * The fsid is placed in the sec_start field of the disk extent.
+ */
+
+int ublkback_request(blkif_t *blkif, blkif_request_t *req, int batch_done);
+int ublkback_response(blkif_t *blkif, blkif_response_t *rsp, int batch_done); /* noop */
+int ublkback_new_blkif(blkif_t *blkif);
+void ublkback_init(void);
--- /dev/null
+/*
+ * xenbus.c
+ *
+ * xenbus interface to the blktap.
+ *
+ * This handles the top half of integration with block devices through the
+ * store -- the tap driver negotiates the device channel etc., while the
+ * userland tap client needs to sort out the disk parameters etc.
+ *
+ * A. Warfield 2005 -- based primarily on the blkback and xenbus driver code.
+ * Comments there apply here...
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <err.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <xs.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <poll.h>
+#include "blktaplib.h"
+#include "list.h"
+
+#if 0
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* --- Xenstore / Xenbus helpers ---------------------------------------- */
+/*
+ * These should all be pulled out into the xenstore API. I'm faulting commands
+ * in from the xenbus interface as I need them.
+ */
+
+
+/* Takes tuples of names, scanf-style args, and void **, NULL terminated. */
+int xs_gather(struct xs_handle *xs, const char *dir, ...)
+{
+ va_list ap;
+ const char *name;
+ char *path;
+ int ret = 0;
+
+ va_start(ap, dir);
+ while (ret == 0 && (name = va_arg(ap, char *)) != NULL) {
+ const char *fmt = va_arg(ap, char *);
+ void *result = va_arg(ap, void *);
+ char *p;
+
+ if (asprintf(&path, "%s/%s", dir, name) == -1)
+ {
+ warn("allocation error in xs_gather!\n");
+ ret = ENOMEM;
+ break;
+ }
+ p = xs_read(xs, path, NULL);
+ free(path);
+ if (p == NULL) {
+ ret = ENOENT;
+ break;
+ }
+ if (fmt) {
+            if (sscanf(p, fmt, result) != 1)
+ ret = EINVAL;
+ free(p);
+ } else
+ *(char **)result = p;
+ }
+ va_end(ap);
+ return ret;
+}
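+/*
+ * e.g., as used by backend_changed() below:
+ *
+ *     long int pdev;
+ *     er = xs_gather(h, be->backpath, "physical-device", "%li", &pdev, NULL);
+ */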
+
+/* Single printf and write: returns 0 on failure, non-zero on success. */
+int xs_printf(struct xs_handle *h, const char *dir, const char *node,
+              const char *fmt, ...)
+{
+    char *buf = NULL, *path = NULL;
+    va_list ap;
+    int ret;
+
+    va_start(ap, fmt);
+    if (vasprintf(&buf, fmt, ap) == -1)
+        buf = NULL;
+    va_end(ap);
+
+    if (asprintf(&path, "%s/%s", dir, node) == -1)
+        path = NULL;
+
+    if ((path == NULL) || (buf == NULL)) {
+        free(buf);
+        free(path);
+        return 0;
+    }
+
+    ret = xs_write(h, path, buf, strlen(buf)+1, O_CREAT);
+
+    free(buf);
+    free(path);
+
+    return ret;
+}
+
+
+int xs_exists(struct xs_handle *h, const char *path)
+{
+ char **d;
+ int num;
+
+ d = xs_directory(h, path, &num);
+ if (d == NULL)
+ return 0;
+ free(d);
+ return 1;
+}
+
+
+
+/* This assumes that the domain name we are looking for is unique! */
+char *get_dom_uuid(struct xs_handle *h, const char *name)
+{
+    char **e, *val, *uuid = NULL;
+    unsigned int num, len;
+    int i;
+    char *path;
+
+    e = xs_directory(h, "/domain", &num);
+    if (e == NULL)
+        return NULL;
+
+    i=0;
+    while (i < num) {
+        asprintf(&path, "/domain/%s/name", e[i]);
+        val = xs_read(h, path, &len);
+        free(path);
+        if (val == NULL) {
+            i++;
+            continue;
+        }
+ if (strcmp(val, name) == 0) {
+ /* match! */
+ asprintf(&path, "/domain/%s/uuid", e[i]);
+ uuid = xs_read(h, path, &len);
+ free(val);
+ free(path);
+ break;
+ }
+ free(val);
+ i++;
+ }
+
+ free(e);
+ return uuid;
+}
+
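+/* Return the offset of the (len+1)-th occurrence of c in str, or strlen(str)
+ * if there are exactly len occurrences; -ERANGE otherwise. e.g. with a
+ * hypothetical path, strsep_len("/a/b/c/d/e/f/g", '/', 6) == 12, the offset
+ * of the '/' before "g". */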
+static int strsep_len(const char *str, char c, unsigned int len)
+{
+ unsigned int i;
+
+ for (i = 0; str[i]; i++)
+ if (str[i] == c) {
+ if (len == 0)
+ return i;
+ len--;
+ }
+ return (len == 0) ? i : -ERANGE;
+}
+
+
+/* xenbus watches: */
+/* Register callback to watch this node. */
+struct xenbus_watch
+{
+ struct list_head list;
+ char *node;
+ void (*callback)(struct xs_handle *h,
+ struct xenbus_watch *,
+ const char *node);
+};
+
+static LIST_HEAD(watches);
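+/*
+ * Usage sketch (see add_blockdevice_probe_watch() below): allocate a watch,
+ * point it at a store node, set the callback, and register it:
+ *
+ *     struct xenbus_watch *w = malloc(sizeof(*w));
+ *     w->node = path;
+ *     w->callback = my_callback;   (a hypothetical callback)
+ *     register_xenbus_watch(h, w);
+ */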
+
+/* A little paranoia: we don't just trust token. */
+static struct xenbus_watch *find_watch(const char *token)
+{
+ struct xenbus_watch *i, *cmp;
+
+ cmp = (void *)strtoul(token, NULL, 16);
+
+ list_for_each_entry(i, &watches, list)
+ if (i == cmp)
+ return i;
+ return NULL;
+}
+
+/* Register callback to watch this node. like xs_watch, return 0 on failure */
+int register_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch)
+{
+ /* Pointer in ascii is the token. */
+ char token[sizeof(watch) * 2 + 1];
+ int er;
+
+ sprintf(token, "%lX", (long)watch);
+ if (find_watch(token))
+ {
+ warn("watch collision!");
+ return -EINVAL;
+ }
+
+ er = xs_watch(h, watch->node, token);
+ if (er != 0) {
+ list_add(&watch->list, &watches);
+ }
+
+ return er;
+}
+
+int unregister_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch)
+{
+ char token[sizeof(watch) * 2 + 1];
+ int er;
+
+ sprintf(token, "%lX", (long)watch);
+ if (!find_watch(token))
+ {
+ warn("no such watch!");
+ return -EINVAL;
+ }
+
+    er = xs_unwatch(h, watch->node, token);
+    list_del(&watch->list);
+
+    if (er == 0)
+        warn("XENBUS: failed to release watch %s", watch->node);
+ return 0;
+}
+
+/* Re-register callbacks to all watches. */
+void reregister_xenbus_watches(struct xs_handle *h)
+{
+ struct xenbus_watch *watch;
+ char token[sizeof(watch) * 2 + 1];
+
+ list_for_each_entry(watch, &watches, list) {
+ sprintf(token, "%lX", (long)watch);
+ xs_watch(h, watch->node, token);
+ }
+}
+
+/* based on watch_thread() */
+int xs_fire_next_watch(struct xs_handle *h)
+{
+ char **res;
+ char *token;
+ char *node = NULL;
+ struct xenbus_watch *w;
+ int er;
+
+ res = xs_read_watch(h);
+ if (res == NULL)
+ return -EAGAIN; /* in O_NONBLOCK, read_watch returns 0... */
+
+ node = res[0];
+ token = res[1];
+
+ er = xs_acknowledge_watch(h, token);
+ if (er == 0)
+ warn("Couldn't acknowledge watch (%s)", token);
+
+ w = find_watch(token);
+ if (!w)
+ {
+ warn("unregistered watch fired");
+ goto done;
+ }
+ w->callback(h, w, node);
+
+ done:
+ free(res);
+ return 1;
+}
+
+
+/* ---------------------------------------------------------------------- */
+
+struct backend_info
+{
+ /* our communications channel */
+ blkif_t *blkif;
+
+ long int frontend_id;
+ long int pdev;
+ long int readonly;
+
+ /* watch back end for changes */
+ struct xenbus_watch backend_watch;
+ char *backpath;
+
+ /* watch front end for changes */
+ struct xenbus_watch watch;
+ char *frontpath;
+
+ struct list_head list;
+};
+
+static LIST_HEAD(belist);
+
+static struct backend_info *be_lookup_be(const char *bepath)
+{
+ struct backend_info *be;
+
+ list_for_each_entry(be, &belist, list)
+ if (strcmp(bepath, be->backpath) == 0)
+ return be;
+ return (struct backend_info *)NULL;
+}
+
+static int be_exists_be(const char *bepath)
+{
+ return ( be_lookup_be(bepath) != NULL );
+}
+
+static struct backend_info *be_lookup_fe(const char *fepath)
+{
+ struct backend_info *be;
+
+ list_for_each_entry(be, &belist, list)
+ if (strcmp(fepath, be->frontpath) == 0)
+ return be;
+ return (struct backend_info *)NULL;
+}
+
+static int backend_remove(struct xs_handle *h, struct backend_info *be)
+{
+ /* Turn off watches. */
+ if (be->watch.node)
+ unregister_xenbus_watch(h, &be->watch);
+ if (be->backend_watch.node)
+ unregister_xenbus_watch(h, &be->backend_watch);
+
+ /* Unhook from be list. */
+ list_del(&be->list);
+
+ /* Free everything else. */
+ if (be->blkif)
+ free_blkif(be->blkif);
+ if (be->frontpath)
+ free(be->frontpath);
+ if (be->backpath)
+ free(be->backpath);
+ free(be);
+ return 0;
+}
+
+static void frontend_changed(struct xs_handle *h, struct xenbus_watch *w,
+ const char *fepath_im)
+{
+ struct backend_info *be;
+ char *fepath = NULL;
+ int er;
+
+ be = be_lookup_fe(w->node);
+ if (be == NULL)
+ {
+        warn("frontend changed called for nonexistent backend! (%s)", w->node);
+ goto fail;
+ }
+
+ /* If other end is gone, delete ourself. */
+ if (w->node && !xs_exists(h, be->frontpath)) {
+ DPRINTF("DELETING BE: %s\n", be->backpath);
+ backend_remove(h, be);
+ return;
+ }
+
+ if (be->blkif == NULL || (be->blkif->state == CONNECTED))
+ return;
+
+ /* Supply the information about the device the frontend needs */
+ er = xs_transaction_start(h, be->backpath);
+ if (er == 0) {
+ warn("starting transaction");
+ goto fail;
+ }
+
+ er = xs_printf(h, be->backpath, "sectors", "%lu",
+ be->blkif->ops->get_size(be->blkif));
+ if (er == 0) {
+ warn("writing sectors");
+        goto abort;
+ }
+
+ er = xs_printf(h, be->backpath, "info", "%u",
+ be->blkif->ops->get_info(be->blkif));
+ if (er == 0) {
+ warn("writing info");
+        goto abort;
+ }
+
+ er = xs_printf(h, be->backpath, "sector-size", "%lu",
+ be->blkif->ops->get_secsize(be->blkif));
+ if (er == 0) {
+ warn("writing sector-size");
+        goto abort;
+ }
+
+ be->blkif->state = CONNECTED;
+
+ xs_transaction_end(h, 0);
+
+    return;
+
+ abort:
+    /* Abort the partially-written transaction on failure. */
+    xs_transaction_end(h, 1);
+ fail:
+    if (fepath)
+        free(fepath);
+}
+
+
+static void backend_changed(struct xs_handle *h, struct xenbus_watch *w,
+ const char *bepath_im)
+{
+ struct backend_info *be;
+ char *path = NULL, *p;
+ int len, er;
+ long int pdev = 0, handle;
+
+ be = be_lookup_be(w->node);
+ if (be == NULL)
+ {
+ warn("backend changed called for nonexistent backend! (%s)", w->node);
+ goto fail;
+ }
+
+ er = xs_gather(h, be->backpath, "physical-device", "%li", &pdev, NULL);
+ if (er != 0)
+ goto fail;
+
+ if (be->pdev && be->pdev != pdev) {
+ warn("changing physical-device not supported");
+ goto fail;
+ }
+ be->pdev = pdev;
+
+    if (asprintf(&path, "%s/%s", w->node, "read-only") == -1)
+        path = NULL;
+    if (path && xs_exists(h, path))
+        be->readonly = 1;
+
+ if (be->blkif == NULL) {
+ /* Front end dir is a number, which is used as the handle. */
+ p = strrchr(be->frontpath, '/') + 1;
+ handle = strtoul(p, NULL, 0);
+
+ be->blkif = alloc_blkif(be->frontend_id);
+ if (be->blkif == NULL)
+ goto fail;
+
+ er = blkif_init(be->blkif, handle, be->pdev, be->readonly);
+ if (er)
+ goto fail;
+
+ DPRINTF("[BECHG]: ADDED A NEW BLKIF (%s)\n", w->node);
+
+ /* Pass in NULL node to skip exist test. */
+ frontend_changed(h, &be->watch, NULL);
+ }
+
+ fail:
+ if (path)
+ free(path);
+
+}
+
+static void blkback_probe(struct xs_handle *h, struct xenbus_watch *w,
+ const char *bepath_im)
+{
+ struct backend_info *be = NULL;
+ char *frontend = NULL, *bepath = NULL;
+ int er, len;
+
+ bepath = strdup(bepath_im);
+ if (!bepath)
+ return;
+ len = strsep_len(bepath, '/', 6);
+ if (len < 0)
+ goto free_be;
+
+    bepath[len] = '\0'; /* truncate the passed-in string with prejudice. */
+
+ be = malloc(sizeof(*be));
+ if (!be) {
+ warn("allocating backend structure");
+ goto free_be;
+ }
+ memset(be, 0, sizeof(*be));
+
+ frontend = NULL;
+ er = xs_gather(h, bepath,
+ "frontend-id", "%li", &be->frontend_id,
+ "frontend", NULL, &frontend,
+ NULL);
+ if (er)
+ goto free_be;
+
+ if (strlen(frontend) == 0 || !xs_exists(h, frontend)) {
+ /* If we can't get a frontend path and a frontend-id,
+ * then our bus-id is no longer valid and we need to
+ * destroy the backend device.
+ */
+ DPRINTF("No frontend (%s)\n", frontend);
+ goto free_be;
+ }
+
+ /* Are we already tracking this device? */
+ if (be_exists_be(bepath))
+ goto free_be;
+
+ be->backpath = bepath;
+ be->backend_watch.node = be->backpath;
+ be->backend_watch.callback = backend_changed;
+ er = register_xenbus_watch(h, &be->backend_watch);
+ if (er == 0) {
+ be->backend_watch.node = NULL;
+ warn("error adding backend watch on %s", bepath);
+ goto free_be;
+ }
+
+ be->frontpath = frontend;
+ be->watch.node = be->frontpath;
+ be->watch.callback = frontend_changed;
+ er = register_xenbus_watch(h, &be->watch);
+ if (er == 0) {
+ be->watch.node = NULL;
+ warn("adding frontend watch on %s", be->frontpath);
+ goto free_be;
+ }
+
+ list_add(&be->list, &belist);
+
+ DPRINTF("[PROBE]: ADDED NEW DEVICE (%s)\n", bepath_im);
+
+ backend_changed(h, &be->backend_watch, bepath);
+ return;
+
+ free_be:
+ if ((be) && (be->backend_watch.node))
+ unregister_xenbus_watch(h, &be->backend_watch);
+ if (frontend)
+ free(frontend);
+ if (bepath)
+ free(bepath);
+ free(be);
+ return;
+}
+
+
+int add_blockdevice_probe_watch(struct xs_handle *h, const char *domname)
+{
+    char *uuid, *path = NULL;
+    struct xenbus_watch *vbd_watch;
+    int er;
+
+    uuid = get_dom_uuid(h, domname);
+
+    DPRINTF("%s: %s\n", domname, (uuid != NULL) ? uuid : "[ not found! ]");
+
+    if (uuid == NULL)
+        return -ENOENT;
+
+    if (asprintf(&path, "/domain/%s/backend/vbd", uuid) == -1)
+        path = NULL;
+    free(uuid);
+    if (path == NULL)
+        return -ENOMEM;
+
+    vbd_watch = (struct xenbus_watch *)malloc(sizeof(struct xenbus_watch));
+    if (vbd_watch == NULL) {
+        free(path);
+        return -ENOMEM;
+    }
+    vbd_watch->node = path;
+    vbd_watch->callback = blkback_probe;
+    er = register_xenbus_watch(h, vbd_watch);
+    if (er == 0) {
+        warn("Error adding vbd probe watch %s", path);
+        free(vbd_watch);
+        free(path);
+        return -EINVAL;
+    }
+
+    return 0;
+}